/*	$OpenBSD: kern_watchdog.c,v 1.16 2022/08/14 01:58:27 jsg Exp $	*/

/*
 * Copyright (c) 2003 Markus Friedl.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/timeout.h>
#include <sys/sysctl.h>

void	wdog_tickle(void *arg);
int	(*wdog_ctl_cb)(void *, int) = NULL;
void	*wdog_ctl_cb_arg = NULL;
int	wdog_period = 0;
int	wdog_auto = 1;
struct	timeout wdog_timeout;

void
wdog_register(int (*cb)(void *, int), void *cb_arg)
{
	if (wdog_ctl_cb != NULL)
		return;

	wdog_ctl_cb = cb;
	wdog_ctl_cb_arg = cb_arg;
	timeout_set(&wdog_timeout, wdog_tickle, NULL);
}

void
wdog_tickle(void *arg)
{
	if (wdog_ctl_cb == NULL)
		return;
	(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, wdog_period);
	timeout_add_msec(&wdog_timeout, wdog_period * 1000 / 2);
}

void
wdog_shutdown(void *arg)
{
	if (wdog_ctl_cb == NULL || wdog_ctl_cb_arg != arg)
		return;

	timeout_del(&wdog_timeout);
	(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, 0);
	wdog_ctl_cb = NULL;
	wdog_period = 0;
	wdog_auto = 1;
}

int
sysctl_wdog(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, period;

	if (wdog_ctl_cb == NULL)
		return (EOPNOTSUPP);

	switch (name[0]) {
	case KERN_WATCHDOG_PERIOD:
		period = wdog_period;
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &period, 0, INT_MAX);
		if (error)
			return (error);
		if (newp) {
			timeout_del(&wdog_timeout);
			wdog_period = (*wdog_ctl_cb)(wdog_ctl_cb_arg, period);
		}
		break;
	case KERN_WATCHDOG_AUTO:
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &wdog_auto, 0, 1);
		if (error)
			return (error);
		break;
	default:
		return (EINVAL);
	}

	if (wdog_auto && wdog_period > 0) {
		(void) (*wdog_ctl_cb)(wdog_ctl_cb_arg, wdog_period);
		timeout_add_msec(&wdog_timeout, wdog_period * 1000 / 2);
	} else
		timeout_del(&wdog_timeout);

	return (error);
}
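/*
 * Illustrative sketch, not part of this file: how a hardware watchdog
 * driver might hook into the framework above.  The driver names
 * (exwdog_softc, exwdog_cb, exwdog_attach) are hypothetical; only
 * wdog_register()/wdog_shutdown() and the callback contract -- the
 * callback receives the requested period in seconds, returns the period
 * actually programmed, and a period of 0 means "disable" -- are taken
 * from the code above.  wdog_tickle() then re-arms the timeout at half
 * the period, and sysctl_wdog() stores the callback's return value in
 * wdog_period.
 */
#if 0	/* example only */
struct exwdog_softc {
	/* device registers, bus handles, ... */
	int	sc_enabled;
};

int
exwdog_cb(void *self, int period)
{
	struct exwdog_softc *sc = self;

	if (period <= 0) {
		/* stop the hardware timer */
		sc->sc_enabled = 0;
		return 0;
	}
	/*
	 * Program the hardware for roughly `period' seconds and return
	 * the value actually used; the framework keeps tickling at half
	 * of that interval while wdog_auto is set.
	 */
	sc->sc_enabled = 1;
	return period;
}

void
exwdog_attach(struct exwdog_softc *sc)
{
	wdog_register(exwdog_cb, sc);
}
#endif	/* example only */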
/*	$OpenBSD: uipc_socket2.c,v 1.128 2022/09/05 14:56:09 bluhm Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/domain.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/signalvar.h> #include <sys/event.h> #include <sys/pool.h> /* * Primitive routines for operating on sockets and socket buffers */ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the * active (originating) side is that soisconnecting() is * called during processing of connect() call, * resulting in an eventual call to soisconnected() if/when the * connection is established. When the connection is torn down * soisdisconnecting() is called during processing of disconnect() call, * and soisdisconnected() is called when the connection to the peer * is totally severed. The semantics of these routines are such that * connectionless protocols can call soisconnected() and soisdisconnected() * only, bypassing the in-progress calls when setting up a ``connection'' * takes no time. * * From the passive side, a socket is created with * two queues of sockets: so_q0 for connections in progress * and so_q for connections already made and awaiting user acceptance. * As a protocol is preparing incoming connections, it creates a socket * structure queued on so_q0 by calling sonewconn(). When the connection * is established, soisconnected() is called, and transfers the * socket structure to so_q, making it available to accept(). * * If a socket is closed with sockets on either * so_q0 or so_q, these sockets are dropped. * * If higher level protocols are implemented in * the kernel, the wakeups done here will sometimes * cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { soassertlocked(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; } void soisconnected(struct socket *so) { struct socket *head = so->so_head; soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; if (head != NULL && so->so_onq == &head->so_q0) { int persocket = solock_persocket(so); if (persocket) { soref(so); soref(head); sounlock(so); solock(head); solock(so); if (so->so_onq != &head->so_q0) { sounlock(head); sorele(head); sorele(so); return; } sorele(head); sorele(so); } soqremque(so, 0); soqinsque(head, so, 1); sorwakeup(head); wakeup_one(&head->so_timeo); if (persocket) sounlock(head); } else { wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } } void soisdisconnecting(struct socket *so) { soassertlocked(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); wakeup(&so->so_timeo); sowwakeup(so); sorwakeup(so); } void soisdisconnected(struct socket *so) { soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); wakeup(&so->so_timeo); sowwakeup(so); sorwakeup(so); } /* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. If the * connection is possible (subject to space constraints, etc.) * then we allocate a new structure, properly linked into the * data structure of the original socket, and return this. * Connstatus may be 0 or SS_ISCONNECTED. 
*/ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; int persocket = solock_persocket(head); int error; /* * XXXSMP as long as `so' and `head' share the same lock, we * can call soreserve() and pr_attach() below w/o explicitly * locking `so'. */ soassertlocked(head); if (m_pool_used() > 95) return (NULL); if (head->so_qlen + head->so_q0len > head->so_qlimit * 3) return (NULL); so = soalloc(PR_NOWAIT | PR_ZERO); if (so == NULL) return (NULL); so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_proto = head->so_proto; so->so_timeo = head->so_timeo; so->so_euid = head->so_euid; so->so_ruid = head->so_ruid; so->so_egid = head->so_egid; so->so_rgid = head->so_rgid; so->so_cpid = head->so_cpid; /* * Lock order will be `head' -> `so' while these sockets are linked. */ if (persocket) solock(so); /* * Inherit watermarks but those may get clamped in low mem situations. */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { if (persocket) sounlock(so); pool_put(&socket_pool, so); return (NULL); } so->so_snd.sb_wat = head->so_snd.sb_wat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs; so->so_rcv.sb_wat = head->so_rcv.sb_wat; so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs; klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so); klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so); sigio_init(&so->so_sigio); sigio_copy(&so->so_sigio, &head->so_sigio); soqinsque(head, so, 0); /* * We need to unlock `head' because PCB layer could release * solock() to enforce desired lock order. */ if (persocket) { head->so_newconn++; sounlock(head); } error = pru_attach(so, 0); if (persocket) { sounlock(so); solock(head); solock(so); if ((head->so_newconn--) == 0) { if ((head->so_state & SS_NEWCONN_WAIT) != 0) { head->so_state &= ~SS_NEWCONN_WAIT; wakeup(&head->so_newconn); } } } if (error) { soqremque(so, 0); if (persocket) sounlock(so); sigio_free(&so->so_sigio); klist_free(&so->so_rcv.sb_sel.si_note); klist_free(&so->so_snd.sb_sel.si_note); pool_put(&socket_pool, so); return (NULL); } if (connstatus) { so->so_state |= connstatus; soqremque(so, 0); soqinsque(head, so, 1); sorwakeup(head); wakeup(&head->so_timeo); } if (persocket) sounlock(so); return (so); } void soqinsque(struct socket *head, struct socket *so, int q) { soassertlocked(head); soassertlocked(so); KASSERT(so->so_onq == NULL); so->so_head = head; if (q == 0) { head->so_q0len++; so->so_onq = &head->so_q0; } else { head->so_qlen++; so->so_onq = &head->so_q; } TAILQ_INSERT_TAIL(so->so_onq, so, so_qe); } int soqremque(struct socket *so, int q) { struct socket *head = so->so_head; soassertlocked(so); soassertlocked(head); if (q == 0) { if (so->so_onq != &head->so_q0) return (0); head->so_q0len--; } else { if (so->so_onq != &head->so_q) return (0); head->so_qlen--; } TAILQ_REMOVE(so->so_onq, so, so_qe); so->so_onq = NULL; so->so_head = NULL; return (1); } /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user * informs the system that no more data is to be sent, by the protocol * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data * will be received, and will normally be applied to the socket by a * protocol when it detects that the peer will send no more data. 
* Data queued for reading in the socket may yet be read. */ void socantsendmore(struct socket *so) { soassertlocked(so); so->so_state |= SS_CANTSENDMORE; sowwakeup(so); } void socantrcvmore(struct socket *so) { soassertlocked(so); so->so_state |= SS_CANTRCVMORE; sorwakeup(so); } void solock(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: NET_LOCK(); break; default: rw_enter_write(&so->so_lock); break; } } void solock_shared(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: if (so->so_proto->pr_usrreqs->pru_lock != NULL) { NET_LOCK_SHARED(); pru_lock(so); } else NET_LOCK(); break; default: rw_enter_write(&so->so_lock); break; } } int solock_persocket(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: return 0; default: return 1; } } void solock_pair(struct socket *so1, struct socket *so2) { KASSERT(so1 != so2); KASSERT(so1->so_type == so2->so_type); KASSERT(solock_persocket(so1)); if (so1 < so2) { solock(so1); solock(so2); } else { solock(so2); solock(so1); } } void sounlock(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: NET_UNLOCK(); break; default: rw_exit_write(&so->so_lock); break; } } void sounlock_shared(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: if (so->so_proto->pr_usrreqs->pru_unlock != NULL) { pru_unlock(so); NET_UNLOCK_SHARED(); } else NET_UNLOCK(); break; default: rw_exit_write(&so->so_lock); break; } } void soassertlocked(struct socket *so) { switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: NET_ASSERT_LOCKED(); break; default: rw_assert_wrlock(&so->so_lock); break; } } int sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, uint64_t nsecs) { int ret; switch (so->so_proto->pr_domain->dom_family) { case PF_INET: case PF_INET6: if (so->so_proto->pr_usrreqs->pru_unlock != NULL && rw_status(&netlock) == RW_READ) { pru_unlock(so); } ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); if (so->so_proto->pr_usrreqs->pru_lock != NULL && rw_status(&netlock) == RW_READ) { pru_lock(so); } break; default: ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); break; } return ret; } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct socket *so, struct sockbuf *sb) { int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; soassertlocked(so); sb->sb_flags |= SB_WAIT; return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs); } int sblock(struct socket *so, struct sockbuf *sb, int wait) { int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; soassertlocked(so); if ((sb->sb_flags & SB_LOCK) == 0) { sb->sb_flags |= SB_LOCK; return (0); } if (wait & M_NOWAIT) return (EWOULDBLOCK); while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP); if (error) return (error); } sb->sb_flags |= SB_LOCK; return (0); } void sbunlock(struct socket *so, struct sockbuf *sb) { soassertlocked(so); sb->sb_flags &= ~SB_LOCK; if (sb->sb_flags & SB_WANT) { sb->sb_flags &= ~SB_WANT; wakeup(&sb->sb_flags); } } /* * Wakeup processes waiting on a socket buffer. * Do asynchronous notification via SIGIO * if the socket buffer has the SB_ASYNC flag set. 
*/ void sowakeup(struct socket *so, struct sockbuf *sb) { soassertlocked(so); if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); } if (sb->sb_flags & SB_ASYNC) pgsigio(&so->so_sigio, SIGIO, 0); KNOTE(&sb->sb_sel.si_note, 0); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and * one for receiving data. Each buffer contains a queue of mbufs, * information about the number of mbufs and amount of data in the * queue, and other fields allowing select() statements and notification * on data availability to be implemented. * * Data stored in a socket buffer is maintained as a list of records. * Each record is a list of mbufs chained together with the m_next * field. Records are chained together with the m_nextpkt field. The upper * level routine soreceive() expects the following conventions to be * observed when placing information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's * name, then a record containing that name must be present before * any associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really * just additional data associated with the message), and there are * ``rights'' to be received, then a record containing this data * should be present (mbuf's must be of type MT_CONTROL). * 3. If a name or rights record exists, then it must be followed by * a data record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space * should be released by calling sbrelease() when the socket is destroyed. */ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { soassertlocked(so); if (sbreserve(so, &so->so_snd, sndcc)) goto bad; if (sbreserve(so, &so->so_rcv, rcvcc)) goto bad2; so->so_snd.sb_wat = sndcc; so->so_rcv.sb_wat = rcvcc; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; return (0); bad2: sbrelease(so, &so->so_snd); bad: return (ENOBUFS); } /* * Allot mbufs to a sockbuf. * Attempt to scale mbmax so that mbcnt doesn't become limiting * if buffering efficiency is near the normal case. */ int sbreserve(struct socket *so, struct sockbuf *sb, u_long cc) { KASSERT(sb == &so->so_rcv || sb == &so->so_snd); soassertlocked(so); if (cc == 0 || cc > sb_max) return (1); sb->sb_hiwat = cc; sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (0); } /* * In low memory situation, do not accept any greater than normal request. */ int sbcheckreserve(u_long cnt, u_long defcnt) { if (cnt > defcnt && sbchecklowmem()) return (ENOBUFS); return (0); } int sbchecklowmem(void) { static int sblowmem; unsigned int used = m_pool_used(); if (used < 60) sblowmem = 0; else if (used > 80) sblowmem = 1; return (sblowmem); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease(struct socket *so, struct sockbuf *sb) { sbflush(so, sb); sb->sb_hiwat = sb->sb_mbmax = 0; } /* * Routines to add and remove * data from an mbuf queue. 
* * The routines sbappend() or sbappendrecord() are normally called to * append new mbufs to a socket buffer, after checking that adequate * space is available, comparing the function sbspace() with the amount * of data to be added. sbappendrecord() differs from sbappend() in * that data supplied is treated as the beginning of a new record. * To place a sender's address, optional access rights, and data in a * socket receive buffer, sbappendaddr() should be used. To place * access rights and data in a socket receive buffer, sbappendrights() * should be used. In either case, the new data begins a new record. * Note that unlike sbappend() and sbappendrecord(), these routines check * for the caller that there will be enough space to store the data. * Each fails if there is not enough space, or if it cannot find mbufs * to store additional information in. * * Reliable protocols may use the socket send buffer to hold data * awaiting acknowledgement. Data is normally copied from a socket * send buffer in a protocol with m_copym for output to a peer, * and then removing the data from the socket buffer with sbdrop() * or sbdroprecord() when the data is acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *where) { struct mbuf *m = sb->sb_mb; while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n", sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("sblastrecordchk from %s", where); } } void sblastmbufchk(struct sockbuf *sb, const char *where) { struct mbuf *m = sb->sb_mb; struct mbuf *n; while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n", sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("sblastmbufchk from %s", where); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) \ do { \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ void sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m) { struct mbuf *n; if (m == NULL) return; soassertlocked(so); SBLASTRECORDCHK(sb, "sbappend 1"); if ((n = sb->sb_lastrecord) != NULL) { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ do { if (n->m_flags & M_EOR) { sbappendrecord(so, sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, it's * also the last record. */ sb->sb_lastrecord = m; } sbcompress(so, sb, m, n); SBLASTRECORDCHK(sb, "sbappend 2"); } /* * This version of sbappend() should only be used when the caller * absolutely knows that there will never be more than one record * in the socket buffer, that is, a stream protocol (such as TCP). 
*/ void sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m) { KASSERT(sb == &so->so_rcv || sb == &so->so_snd); soassertlocked(so); KDASSERT(m->m_nextpkt == NULL); KASSERT(sb->sb_mb == sb->sb_lastrecord); SBLASTMBUFCHK(sb, __func__); sbcompress(so, sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb, __func__); } #ifdef SOCKBUF_DEBUG void sbcheck(struct socket *so, struct sockbuf *sb) { struct mbuf *m, *n; u_long len = 0, mbcnt = 0; for (m = sb->sb_mb; m; m = m->m_nextpkt) { for (n = m; n; n = n->m_next) { len += n->m_len; mbcnt += MSIZE; if (n->m_flags & M_EXT) mbcnt += n->m_ext.ext_size; if (m != n && n->m_nextpkt) panic("sbcheck nextpkt"); } } if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc, mbcnt, sb->sb_mbcnt); panic("sbcheck"); } } #endif /* * As above, except the mbuf chain * begins a new record. */ void sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; KASSERT(sb == &so->so_rcv || sb == &so->so_snd); soassertlocked(so); if (m0 == NULL) return; /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(so, sb, m0); SBLASTRECORDCHK(sb, "sbappendrecord 1"); SBLINKRECORD(sb, m0); m = m0->m_next; m0->m_next = NULL; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } sbcompress(so, sb, m, m0); SBLASTRECORDCHK(sb, "sbappendrecord 2"); } /* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *nlast; int space = asa->sa_len; soassertlocked(so); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); if (m0) space += m0->m_pkthdr.len; for (n = control; n; n = n->m_next) { space += n->m_len; if (n->m_next == NULL) /* keep pointer to last control buf */ break; } if (space > sbspace(so, sb)) return (0); if (asa->sa_len > MLEN) return (0); MGET(m, M_DONTWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; memcpy(mtod(m, caddr_t), asa, asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; SBLASTRECORDCHK(sb, "sbappendaddr 1"); for (n = m; n->m_next != NULL; n = n->m_next) sballoc(so, sb, n); sballoc(so, sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb, "sbappendaddr"); SBLASTRECORDCHK(sb, "sbappendaddr 2"); return (1); } int sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *mlast, *n; int space = 0; if (control == NULL) panic("sbappendcontrol"); for (m = control; ; m = m->m_next) { space += m->m_len; if (m->m_next == NULL) break; } n = m; /* save pointer to last control buffer */ for (m = m0; m; m = m->m_next) space += m->m_len; if (space > sbspace(so, sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb, "sbappendcontrol 1"); for (m = control; m->m_next != NULL; m = m->m_next) sballoc(so, sb, m); sballoc(so, sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb, "sbappendcontrol"); SBLASTRECORDCHK(sb, "sbappendcontrol 2"); return (1); } /* * Compress mbuf chain m into the socket * buffer sb following mbuf n. 
If n * is null, the buffer is presumed empty. */ void sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && /* m_trailingspace() checks buffer writeability */ m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size : MCLBYTES) / 4 && /* XXX Don't copy too much */ m->m_len <= m_trailingspace(n) && n->m_type == m->m_type) { memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t), m->m_len); n->m_len += m->m_len; sb->sb_cc += m->m_len; if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME) sb->sb_datacc += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(so, sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = NULL; } if (eor) { if (n) n->m_flags |= eor; else printf("semi-panic: sbcompress"); } SBLASTMBUFCHK(sb, __func__); } /* * Free all mbufs in a sockbuf. * Check that all resources are reclaimed. */ void sbflush(struct socket *so, struct sockbuf *sb) { KASSERT(sb == &so->so_rcv || sb == &so->so_snd); KASSERT((sb->sb_flags & SB_LOCK) == 0); while (sb->sb_mbcnt) sbdrop(so, sb, (int)sb->sb_cc); KASSERT(sb->sb_cc == 0); KASSERT(sb->sb_datacc == 0); KASSERT(sb->sb_mb == NULL); KASSERT(sb->sb_mbtail == NULL); KASSERT(sb->sb_lastrecord == NULL); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop(struct socket *so, struct sockbuf *sb, int len) { struct mbuf *m, *mn; struct mbuf *next; KASSERT(sb == &so->so_rcv || sb == &so->so_snd); soassertlocked(so); next = (m = sb->sb_mb) ? m->m_nextpkt : NULL; while (len > 0) { if (m == NULL) { if (next == NULL) panic("sbdrop"); m = next; next = m->m_nextpkt; continue; } if (m->m_len > len) { m->m_len -= len; m->m_data += len; sb->sb_cc -= len; if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME) sb->sb_datacc -= len; break; } len -= m->m_len; sbfree(so, sb, m); mn = m_free(m); m = mn; } while (m && m->m_len == 0) { sbfree(so, sb, m); mn = m_free(m); m = mn; } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part * makes sure sb_lastrecord is up-to-date if we dropped * part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) sb->sb_lastrecord = m; } /* * Drop a record off the front of a sockbuf * and move the next record to the front. */ void sbdroprecord(struct socket *so, struct sockbuf *sb) { struct mbuf *m, *mn; m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(so, sb, m); mn = m_free(m); } while ((m = mn) != NULL); } SB_EMPTY_FIXUP(sb); } /* * Create a "control" mbuf containing the specified data * with the specified type for presentation on a socket buffer. 
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
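/*
 * Illustrative sketch, not part of this file: a protocol input routine
 * that wants to hand ancillary data to the application can wrap the
 * payload in a cmsghdr-bearing MT_CONTROL mbuf with sbcreatecontrol()
 * and queue it via sbappendaddr().  The function name below is
 * hypothetical, and SOL_SOCKET/SCM_TIMESTAMP is just one plausible
 * payload; only the sbcreatecontrol()/sbappendaddr() signatures and
 * their failure behaviour (both return without consuming the caller's
 * chains) come from the code above.
 */
#if 0	/* example only */
static int
example_queue_with_timestamp(struct socket *so, const struct sockaddr *from,
    struct mbuf *data)
{
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
	if (control == NULL)
		return (ENOBUFS);	/* no mbuf or cluster available */

	/* On failure the caller still owns both chains and must free them. */
	if (sbappendaddr(so, &so->so_rcv, from, data, control) == 0) {
		m_freem(control);
		m_freem(data);
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}
#endif	/* example only */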
/*	$OpenBSD: ip_icmp.c,v 1.191 2022/05/05 13:57:40 claudio Exp $	*/
/*	$NetBSD: ip_icmp.c,v 1.19 1996/02/13 23:42:22 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include "carp.h" #include "pf.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/if_var.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> #include <netinet/ip_var.h> #include <netinet/icmp_var.h> #if NCARP > 0 #include <net/if_types.h> #include <netinet/ip_carp.h> #endif #if NPF > 0 #include <net/pfvar.h> #endif /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. 
*/ #ifdef ICMPPRINTFS int icmpprintfs = 0; /* Settable from ddb */ #endif /* values controllable via sysctl */ int icmpmaskrepl = 0; int icmpbmcastecho = 0; int icmptstamprepl = 1; int icmperrppslim = 100; int icmp_rediraccept = 0; int icmp_redirtimeout = 10 * 60; static int icmperrpps_count = 0; static struct timeval icmperrppslim_last; struct rttimer_queue ip_mtudisc_timeout_q; struct rttimer_queue icmp_redirect_timeout_q; struct cpumem *icmpcounters; const struct sysctl_bounded_args icmpctl_vars[] = { { ICMPCTL_MASKREPL, &icmpmaskrepl, 0, 1 }, { ICMPCTL_BMCASTECHO, &icmpbmcastecho, 0, 1 }, { ICMPCTL_ERRPPSLIMIT, &icmperrppslim, -1, INT_MAX }, { ICMPCTL_REDIRACCEPT, &icmp_rediraccept, 0, 1 }, { ICMPCTL_TSTAMPREPL, &icmptstamprepl, 0, 1 }, }; void icmp_mtudisc_timeout(struct rtentry *, u_int); int icmp_ratelimit(const struct in_addr *, const int, const int); int icmp_input_if(struct ifnet *, struct mbuf **, int *, int, int); int icmp_sysctl_icmpstat(void *, size_t *, void *); void icmp_init(void) { rt_timer_queue_init(&ip_mtudisc_timeout_q, ip_mtudisc_timeout, &icmp_mtudisc_timeout); rt_timer_queue_init(&icmp_redirect_timeout_q, icmp_redirtimeout, NULL); icmpcounters = counters_alloc(icps_ncounters); } struct mbuf * icmp_do_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu) { struct ip *oip = mtod(n, struct ip *), *nip; unsigned oiplen = oip->ip_hl << 2; struct icmp *icp; struct mbuf *m; unsigned icmplen, mblen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%x, %d, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) icmpstat_inc(icps_error); /* * Don't send error if not the first fragment of message. * Don't error if the old packet protocol was ICMP * error message, only known informational types. */ if (oip->ip_off & htons(IP_OFFMASK)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiplen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *) ((caddr_t)oip + oiplen))->icmp_type)) { icmpstat_inc(icps_oldicmp); goto freeit; } /* Don't send error in response to a multicast or broadcast packet */ if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* * First, do a rate limitation check. */ if (icmp_ratelimit(&oip->ip_src, type, code)) { icmpstat_inc(icps_toofreq); goto freeit; } /* * Now, formulate icmp message */ icmplen = oiplen + min(8, ntohs(oip->ip_len)); /* * Defend against mbuf chains shorter than oip->ip_len: */ mblen = 0; for (m = n; m && (mblen < icmplen); m = m->m_next) mblen += m->m_len; icmplen = min(mblen, icmplen); /* * As we are not required to return everything we have, * we return whatever we can return at ease. * * Note that ICMP datagrams longer than 576 octets are out of spec * according to RFC1812; */ KASSERT(ICMP_MINLEN + sizeof (struct ip) <= MCLBYTES); if (sizeof (struct ip) + icmplen + ICMP_MINLEN > MCLBYTES) icmplen = MCLBYTES - ICMP_MINLEN - sizeof (struct ip); m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m && ((sizeof (struct ip) + icmplen + ICMP_MINLEN + sizeof(long) - 1) &~ (sizeof(long) - 1)) > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (m == NULL) goto freeit; /* keep in same rtable and preserve other pkthdr bits */ m->m_pkthdr.ph_rtableid = n->m_pkthdr.ph_rtableid; m->m_pkthdr.ph_ifidx = n->m_pkthdr.ph_ifidx; /* move PF_GENERATED to new packet, if existent XXX preserve more? 
*/ if (n->m_pkthdr.pf.flags & PF_TAG_GENERATED) m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; m->m_pkthdr.len = m->m_len = icmplen + ICMP_MINLEN; m_align(m, m->m_len); icp = mtod(m, struct icmp *); if ((u_int)type > ICMP_MAXTYPE) panic("icmp_error"); icmpstat_inc(icps_outhist + type); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && destmtu) icp->icmp_nextmtu = htons(destmtu); } icp->icmp_code = code; m_copydata(n, 0, icmplen, &icp->icmp_ip); /* * Now, copy old ip header (without options) * in front of icmp message. */ m = m_prepend(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) goto freeit; nip = mtod(m, struct ip *); /* ip_v set in ip_output */ nip->ip_hl = sizeof(struct ip) >> 2; nip->ip_tos = 0; nip->ip_len = htons(m->m_len); /* ip_id set in ip_output */ nip->ip_off = 0; /* ip_ttl set in icmp_reflect */ nip->ip_p = IPPROTO_ICMP; nip->ip_src = oip->ip_src; nip->ip_dst = oip->ip_dst; m_freem(n); return (m); freeit: m_freem(n); return (NULL); } /* * Generate an error packet of type error * in response to bad packet ip. * * The ip packet inside has ip_off and ip_len in host byte order. */ void icmp_error(struct mbuf *n, int type, int code, u_int32_t dest, int destmtu) { struct mbuf *m; m = icmp_do_error(n, type, code, dest, destmtu); if (m != NULL) if (!icmp_reflect(m, NULL, NULL)) icmp_send(m, NULL); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto, int af) { struct ifnet *ifp; ifp = if_get((*mp)->m_pkthdr.ph_ifidx); if (ifp == NULL) { m_freemp(mp); return IPPROTO_DONE; } proto = icmp_input_if(ifp, mp, offp, proto, af); if_put(ifp); return proto; } int icmp_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto, int af) { struct mbuf *m = *mp; int hlen = *offp; struct icmp *icp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in sin; int icmplen, i, code; struct in_ifaddr *ia; void (*ctlfunc)(int, struct sockaddr *, u_int, void *); struct mbuf *opts; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ icmplen = ntohs(ip->ip_len) - hlen; #ifdef ICMPPRINTFS if (icmpprintfs) { char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); printf("icmp_input from %s to %s, len %d\n", src, dst, icmplen); } #endif if (icmplen < ICMP_MINLEN) { icmpstat_inc(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMAX); if ((m = *mp = m_pullup(m, i)) == NULL) { icmpstat_inc(icps_tooshort); return IPPROTO_DONE; } ip = mtod(m, struct ip *); if (in4_cksum(m, 0, hlen, icmplen)) { icmpstat_inc(icps_checksum); goto freeit; } icp = (struct icmp *)(mtod(m, caddr_t) + hlen); #ifdef ICMPPRINTFS /* * Message type specific processing. */ if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif if (icp->icmp_type > ICMP_MAXTYPE) goto raw; #if NPF > 0 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { switch (icp->icmp_type) { /* * As pf_icmp_mapping() considers redirects belonging to a * diverted connection, we must include it here. */ case ICMP_REDIRECT: /* FALLTHROUGH */ /* * These ICMP types map to other connections. They must be * delivered to pr_ctlinput() also for diverted connections. 
*/ case ICMP_UNREACH: case ICMP_TIMXCEED: case ICMP_PARAMPROB: case ICMP_SOURCEQUENCH: /* * Do not use the divert-to property of the TCP or UDP * rule when doing the PCB lookup for the raw socket. */ m->m_pkthdr.pf.flags &=~ PF_TAG_DIVERTED; break; default: goto raw; } } #endif /* NPF */ icmpstat_inc(icps_inhist + icp->icmp_type); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: case ICMP_UNREACH_SRCFAIL: code += PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_TOSNET: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_FILTER_PROHIB: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_HOST; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: if (code) goto badcode; code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { icmpstat_inc(icps_badlen); goto freeit; } if (IN_MULTICAST(icp->icmp_ip.ip_dst.s_addr)) goto badcode; #ifdef INET6 /* Get more contiguous data for a v6 in v4 ICMP message. */ if (icp->icmp_ip.ip_p == IPPROTO_IPV6) { if (icmplen < ICMP_V6ADVLENMIN || icmplen < ICMP_V6ADVLEN(icp)) { icmpstat_inc(icps_badlen); goto freeit; } } #endif /* INET6 */ #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = icp->icmp_ip.ip_dst; #if NCARP > 0 if (carp_lsdrop(ifp, m, AF_INET, &sin.sin_addr.s_addr, &ip->ip_dst.s_addr, 1)) goto freeit; #endif /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, sintosa(&sin), m->m_pkthdr.ph_rtableid, &icp->icmp_ip); break; badcode: icmpstat_inc(icps_badcode); break; case ICMP_ECHO: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat_inc(icps_bmcastecho); break; } icp->icmp_type = ICMP_ECHOREPLY; goto reflect; case ICMP_TSTAMP: if (icmptstamprepl == 0) break; if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat_inc(icps_bmcastecho); break; } if (icmplen < ICMP_TSLEN) { icmpstat_inc(icps_badlen); break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ goto reflect; case ICMP_MASKREQ: if (icmpmaskrepl == 0) break; if (icmplen < ICMP_MASKLEN) { icmpstat_inc(icps_badlen); break; } /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. 
*/ memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); if (ip->ip_dst.s_addr == INADDR_BROADCAST || ip->ip_dst.s_addr == INADDR_ANY) sin.sin_addr = ip->ip_src; else sin.sin_addr = ip->ip_dst; if (ifp == NULL) break; ia = ifatoia(ifaof_ifpforaddr(sintosa(&sin), ifp)); if (ia == NULL) break; icp->icmp_type = ICMP_MASKREPLY; icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; if (ip->ip_src.s_addr == 0) { if (ifp->if_flags & IFF_BROADCAST) { if (ia->ia_broadaddr.sin_addr.s_addr) ip->ip_src = ia->ia_broadaddr.sin_addr; else ip->ip_src.s_addr = INADDR_BROADCAST; } else if (ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = ia->ia_dstaddr.sin_addr; } reflect: #if NCARP > 0 if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, &ip->ip_dst.s_addr, 1)) goto freeit; #endif icmpstat_inc(icps_reflect); icmpstat_inc(icps_outhist + icp->icmp_type); if (!icmp_reflect(m, &opts, NULL)) { icmp_send(m, opts); m_free(opts); } return IPPROTO_DONE; case ICMP_REDIRECT: { struct sockaddr_in sdst; struct sockaddr_in sgw; struct sockaddr_in ssrc; struct rtentry *newrt = NULL; if (icmp_rediraccept == 0 || ipforwarding == 1) goto freeit; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { icmpstat_inc(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ memset(&sdst, 0, sizeof(sdst)); memset(&sgw, 0, sizeof(sgw)); memset(&ssrc, 0, sizeof(ssrc)); sdst.sin_family = sgw.sin_family = ssrc.sin_family = AF_INET; sdst.sin_len = sgw.sin_len = ssrc.sin_len = sizeof(sdst); memcpy(&sdst.sin_addr, &icp->icmp_ip.ip_dst, sizeof(sdst.sin_addr)); memcpy(&sgw.sin_addr, &icp->icmp_gwaddr, sizeof(sgw.sin_addr)); memcpy(&ssrc.sin_addr, &ip->ip_src, sizeof(ssrc.sin_addr)); #ifdef ICMPPRINTFS if (icmpprintfs) { char gw[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &icp->icmp_gwaddr, gw, sizeof(gw)); inet_ntop(AF_INET, &icp->icmp_ip.ip_dst, dst, sizeof(dst)); printf("redirect dst %s to %s\n", dst, gw); } #endif #if NCARP > 0 if (carp_lsdrop(ifp, m, AF_INET, &sdst.sin_addr.s_addr, &ip->ip_dst.s_addr, 1)) goto freeit; #endif rtredirect(sintosa(&sdst), sintosa(&sgw), sintosa(&ssrc), &newrt, m->m_pkthdr.ph_rtableid); if (newrt != NULL && icmp_redirtimeout > 0) { rt_timer_add(newrt, &icmp_redirect_timeout_q, m->m_pkthdr.ph_rtableid); } rtfree(newrt); pfctlinput(PRC_REDIRECT_HOST, sintosa(&sdst)); break; } /* * No kernel processing for the following; * just fall through to send to raw listener. 
*/ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_TRACEROUTE: case ICMP_DATACONVERR: case ICMP_MOBILE_REDIRECT: case ICMP_IPV6_WHEREAREYOU: case ICMP_IPV6_IAMHERE: case ICMP_MOBILE_REGREQUEST: case ICMP_MOBILE_REGREPLY: case ICMP_PHOTURIS: default: break; } raw: return rip_input(mp, offp, proto, af); freeit: m_freem(m); return IPPROTO_DONE; } /* * Reflect the ip packet back to the source */ int icmp_reflect(struct mbuf *m, struct mbuf **op, struct in_ifaddr *ia) { struct ip *ip = mtod(m, struct ip *); struct mbuf *opts = NULL; struct sockaddr_in sin; struct rtentry *rt = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); u_int rtableid; u_int8_t pfflags; if (!in_canforward(ip->ip_src) && ((ip->ip_src.s_addr & IN_CLASSA_NET) != htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { m_freem(m); /* Bad return address */ return (EHOSTUNREACH); } if (m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) { m_freem(m); return (ELOOP); } rtableid = m->m_pkthdr.ph_rtableid; pfflags = m->m_pkthdr.pf.flags; m_resethdr(m); m->m_pkthdr.ph_rtableid = rtableid; m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED; /* * If the incoming packet was addressed directly to us, * use dst as the src for the reply. For broadcast, use * the address which corresponds to the incoming interface. */ if (ia == NULL) { memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr = ip->ip_dst; rt = rtalloc(sintosa(&sin), 0, rtableid); if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) ia = ifatoia(rt->rt_ifa); } /* * The following happens if the packet was not addressed to us. * Use the new source address and do a route lookup. If it fails * drop the packet as there is no path to the host. */ if (ia == NULL) { rtfree(rt); memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr = ip->ip_src; /* keep packet in the original virtual instance */ rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); if (rt == NULL) { ipstat_inc(ips_noroute); m_freem(m); return (EHOSTUNREACH); } ia = ifatoia(rt->rt_ifa); } ip->ip_dst = ip->ip_src; ip->ip_ttl = MAXTTL; /* It is safe to dereference ``ia'' iff ``rt'' is valid. */ ip->ip_src = ia->ia_addr.sin_addr; rtfree(rt); if (optlen > 0) { u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. 
*/ cp = (u_char *) (ip + 1); if (op && (opts = ip_srcroute(m)) == NULL && (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (op && opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it * "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { memcpy(mtod(opts, caddr_t) + opts->m_len, cp, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ if ((cnt = opts->m_len % 4) != 0) for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m->m_flags &= ~(M_BCAST|M_MCAST); if (op) *op = opts; return (0); } /* * Send an icmp packet back to the ip level */ void icmp_send(struct mbuf *m, struct mbuf *opts) { struct ip *ip = mtod(m, struct ip *); int hlen; struct icmp *icp; hlen = ip->ip_hl << 2; icp = (struct icmp *)(mtod(m, caddr_t) + hlen); icp->icmp_cksum = 0; m->m_pkthdr.csum_flags = M_ICMP_CSUM_OUT; #ifdef ICMPPRINTFS if (icmpprintfs) { char dst[INET_ADDRSTRLEN], src[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); printf("icmp_send dst %s src %s\n", dst, src); } #endif /* * ip_send() cannot handle IP options properly. So in case we have * options fill out the IP header here and use ip_send_raw() instead. */ if (opts != NULL) { m = ip_insertoptions(m, opts, &hlen); ip = mtod(m, struct ip *); ip->ip_hl = (hlen >> 2); ip->ip_v = IPVERSION; ip->ip_off &= htons(IP_DF); ip->ip_id = htons(ip_randomid()); ipstat_inc(ips_localout); ip_send_raw(m); } else ip_send(m); } u_int32_t iptime(void) { struct timeval atv; u_long t; microtime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } int icmp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; /* All sysctl names at this level are terminal. 
*/ if (namelen != 1) return (ENOTDIR); switch (name[0]) { case ICMPCTL_REDIRTIMEOUT: NET_LOCK(); error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &icmp_redirtimeout, 0, INT_MAX); rt_timer_queue_change(&icmp_redirect_timeout_q, icmp_redirtimeout); NET_UNLOCK(); break; case ICMPCTL_STATS: error = icmp_sysctl_icmpstat(oldp, oldlenp, newp); break; default: NET_LOCK(); error = sysctl_bounded_arr(icmpctl_vars, nitems(icmpctl_vars), name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); break; } return (error); } int icmp_sysctl_icmpstat(void *oldp, size_t *oldlenp, void *newp) { uint64_t counters[icps_ncounters]; struct icmpstat icmpstat; u_long *words = (u_long *)&icmpstat; int i; CTASSERT(sizeof(icmpstat) == (nitems(counters) * sizeof(u_long))); memset(&icmpstat, 0, sizeof icmpstat); counters_read(icmpcounters, counters, nitems(counters)); for (i = 0; i < nitems(counters); i++) words[i] = (u_long)counters[i]; return (sysctl_rdstruct(oldp, oldlenp, newp, &icmpstat, sizeof(icmpstat))); } struct rtentry * icmp_mtudisc_clone(struct in_addr dst, u_int rtableid, int ipsec) { struct sockaddr_in sin; struct rtentry *rt; int error; memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = dst; rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); /* Check if the route is actually usable */ if (!rtisvalid(rt)) goto bad; /* IPsec needs the route only for PMTU, it can use reject for that */ if (!ipsec && (rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE))) goto bad; /* * No PMTU for local routes and permanent neighbors, * ARP and NDP use the same expire timer as the route. */ if (ISSET(rt->rt_flags, RTF_LOCAL) || (ISSET(rt->rt_flags, RTF_LLINFO) && rt->rt_expire == 0)) goto bad; /* If we didn't get a host route, allocate one */ if ((rt->rt_flags & RTF_HOST) == 0) { struct rtentry *nrt; struct rt_addrinfo info; struct sockaddr_rtlabel sa_rl; memset(&info, 0, sizeof(info)); info.rti_ifa = rt->rt_ifa; info.rti_flags = RTF_GATEWAY | RTF_HOST | RTF_DYNAMIC; info.rti_info[RTAX_DST] = sintosa(&sin); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl); error = rtrequest(RTM_ADD, &info, rt->rt_priority, &nrt, rtableid); if (error) goto bad; nrt->rt_rmx = rt->rt_rmx; rtfree(rt); rt = nrt; rtm_send(rt, RTM_ADD, 0, rtableid); } error = rt_timer_add(rt, &ip_mtudisc_timeout_q, rtableid); if (error) goto bad; return (rt); bad: rtfree(rt); return (NULL); } /* Table of common MTUs: */ static const u_short mtu_table[] = { 65535, 65280, 32000, 17914, 9180, 8166, 4352, 2002, 1492, 1006, 508, 296, 68, 0 }; void icmp_mtudisc(struct icmp *icp, u_int rtableid) { struct rtentry *rt; struct ifnet *ifp; u_long mtu = ntohs(icp->icmp_nextmtu); /* Why a long? 
IPv6 */ rt = icmp_mtudisc_clone(icp->icmp_ip.ip_dst, rtableid, 0); if (rt == NULL) return; ifp = if_get(rt->rt_ifidx); if (ifp == NULL) { rtfree(rt); return; } if (mtu == 0) { int i = 0; mtu = ntohs(icp->icmp_ip.ip_len); /* Some 4.2BSD-based routers incorrectly adjust the ip_len */ if (mtu > rt->rt_mtu && rt->rt_mtu != 0) mtu -= (icp->icmp_ip.ip_hl << 2); /* If we still can't guess a value, try the route */ if (mtu == 0) { mtu = rt->rt_mtu; /* If no route mtu, default to the interface mtu */ if (mtu == 0) mtu = ifp->if_mtu; } for (i = 0; i < nitems(mtu_table); i++) if (mtu > mtu_table[i]) { mtu = mtu_table[i]; break; } } /* * XXX: RTV_MTU is overloaded, since the admin can set it * to turn off PMTU for a route, and the kernel can * set it to indicate a serious problem with PMTU * on a route. We should be using a separate flag * for the kernel to indicate this. */ if ((rt->rt_locks & RTV_MTU) == 0) { if (mtu < 296 || mtu > ifp->if_mtu) rt->rt_locks |= RTV_MTU; else if (rt->rt_mtu > mtu || rt->rt_mtu == 0) rt->rt_mtu = mtu; } if_put(ifp); rtfree(rt); } void icmp_mtudisc_timeout(struct rtentry *rt, u_int rtableid) { struct ifnet *ifp; NET_ASSERT_LOCKED(); ifp = if_get(rt->rt_ifidx); if (ifp == NULL) return; if ((rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) == (RTF_DYNAMIC|RTF_HOST)) { void (*ctlfunc)(int, struct sockaddr *, u_int, void *); struct sockaddr_in sin; sin = *satosin(rt_key(rt)); rtdeletemsg(rt, ifp, rtableid); /* Notify TCP layer of increased Path MTU estimate */ ctlfunc = inetsw[ip_protox[IPPROTO_TCP]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(PRC_MTUINC, sintosa(&sin), rtableid, NULL); } else { if ((rt->rt_locks & RTV_MTU) == 0) rt->rt_mtu = 0; } if_put(ifp); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp packet. * Returns 1 if the router SHOULD NOT send this icmp packet due to rate * limitation. * * XXX per-destination/type check necessary? */ int icmp_ratelimit(const struct in_addr *dst, const int type, const int code) { /* PPS limit */ if (!ppsratecheck(&icmperrppslim_last, &icmperrpps_count, icmperrppslim)) return 1; /* The packet is subject to rate limit */ return 0; /* okay to send */ } int icmp_do_exthdr(struct mbuf *m, u_int16_t class, u_int8_t ctype, void *buf, size_t len) { struct ip *ip = mtod(m, struct ip *); int hlen, off; struct mbuf *n; struct icmp *icp; struct icmp_ext_hdr *ieh; struct { struct icmp_ext_hdr ieh; struct icmp_ext_obj_hdr ieo; } hdr; hlen = ip->ip_hl << 2; icp = (struct icmp *)(mtod(m, caddr_t) + hlen); if (icp->icmp_type != ICMP_TIMXCEED && icp->icmp_type != ICMP_UNREACH && icp->icmp_type != ICMP_PARAMPROB) /* exthdr not supported */ return (0); if (icp->icmp_length != 0) /* exthdr already present, giving up */ return (0); /* the actual offset starts after the common ICMP header */ hlen += ICMP_MINLEN; /* exthdr must start on a word boundary */ off = roundup(ntohs(ip->ip_len) - hlen, sizeof(u_int32_t)); /* ... 
and at an offset of ICMP_EXT_OFFSET or bigger */ off = max(off, ICMP_EXT_OFFSET); icp->icmp_length = off / sizeof(u_int32_t); memset(&hdr, 0, sizeof(hdr)); hdr.ieh.ieh_version = ICMP_EXT_HDR_VERSION; hdr.ieo.ieo_length = htons(sizeof(struct icmp_ext_obj_hdr) + len); hdr.ieo.ieo_cnum = class; hdr.ieo.ieo_ctype = ctype; if (m_copyback(m, hlen + off, sizeof(hdr), &hdr, M_NOWAIT) || m_copyback(m, hlen + off + sizeof(hdr), len, buf, M_NOWAIT)) { m_freem(m); return (ENOBUFS); } /* calculate checksum */ n = m_getptr(m, hlen + off, &off); if (n == NULL) panic("icmp_do_exthdr: m_getptr failure"); ieh = (struct icmp_ext_hdr *)(mtod(n, caddr_t) + off); ieh->ieh_cksum = in4_cksum(n, 0, off, sizeof(hdr) + len); ip->ip_len = htons(m->m_pkthdr.len); return (0); }
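/*
 * Illustrative sketch, not part of the original file: one way a caller
 * could use icmp_do_exthdr() above to append an RFC 4884/4950 style
 * extension object to an already built ICMP error packet.  The function
 * name, the class/c-type pair (1/1, the incoming MPLS label stack
 * object) and the label value are assumptions made for this example
 * only; the block is kept under #if 0 so it is never compiled.
 */
#if 0
static int
icmp_exthdr_example(struct mbuf *m)
{
	/* hypothetical 20-bit MPLS label, shifted into the top of the word */
	u_int32_t label = htonl(42 << 12);

	/*
	 * icmp_do_exthdr() returns 0 if the extension was added or if the
	 * ICMP type does not take extensions; on ENOBUFS the mbuf has
	 * already been freed and must not be touched again.
	 */
	return (icmp_do_exthdr(m, 1, 1, &label, sizeof(label)));
}
#endif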
/* $OpenBSD: kern_sig.c,v 1.299 2022/08/14 01:58:27 jsg Exp $ */ /* $NetBSD: kern_sig.c,v 1.54 1996/04/22 01:38:32 christos Exp $ */ /* * Copyright (c) 1997 Theo de Raadt. All rights reserved. * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include <sys/param.h> #include <sys/signalvar.h> #include <sys/queue.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/event.h> #include <sys/proc.h> #include <sys/systm.h> #include <sys/acct.h> #include <sys/fcntl.h> #include <sys/filedesc.h> #include <sys/wait.h> #include <sys/ktrace.h> #include <sys/stat.h> #include <sys/malloc.h> #include <sys/pool.h> #include <sys/sched.h> #include <sys/user.h> #include <sys/syslog.h> #include <sys/ttycom.h> #include <sys/pledge.h> #include <sys/witness.h> #include <sys/exec_elf.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <uvm/uvm_extern.h> #include <machine/tcb.h> int nosuidcoredump = 1; int filt_sigattach(struct knote *kn); void filt_sigdetach(struct knote *kn); int filt_signal(struct knote *kn, long hint); const struct filterops sig_filtops = { .f_flags = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; /* * The array below categorizes the signals and their default actions. */ const int sigprop[NSIG] = { 0, /* unused */ SA_KILL, /* SIGHUP */ SA_KILL, /* SIGINT */ SA_KILL|SA_CORE, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL, /* SIGPIPE */ SA_KILL, /* SIGALRM */ SA_KILL, /* SIGTERM */ SA_IGNORE, /* SIGURG */ SA_STOP, /* SIGSTOP */ SA_STOP|SA_TTYSTOP, /* SIGTSTP */ SA_IGNORE|SA_CONT, /* SIGCONT */ SA_IGNORE, /* SIGCHLD */ SA_STOP|SA_TTYSTOP, /* SIGTTIN */ SA_STOP|SA_TTYSTOP, /* SIGTTOU */ SA_IGNORE, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL, /* SIGVTALRM */ SA_KILL, /* SIGPROF */ SA_IGNORE, /* SIGWINCH */ SA_IGNORE, /* SIGINFO */ SA_KILL, /* SIGUSR1 */ SA_KILL, /* SIGUSR2 */ SA_IGNORE, /* SIGTHR */ }; #define CONTSIGMASK (sigmask(SIGCONT)) #define STOPSIGMASK (sigmask(SIGSTOP) | sigmask(SIGTSTP) | \ sigmask(SIGTTIN) | sigmask(SIGTTOU)) void setsigvec(struct proc *, int, struct sigaction *); void proc_stop(struct proc *p, int); void proc_stop_sweep(void *); void *proc_stop_si; void setsigctx(struct proc *, int, struct sigctx *); void postsig_done(struct proc *, int, sigset_t, int); void postsig(struct proc *, int, struct sigctx *); int cansignal(struct proc *, struct process *, int); struct pool sigacts_pool; /* memory pool for sigacts structures */ void sigio_del(struct sigiolst *); void sigio_unlink(struct sigio_ref *, struct sigiolst *); struct mutex sigio_lock = MUTEX_INITIALIZER(IPL_HIGH); /* * Can thread p, send the signal signum to process qr? 
*/ int cansignal(struct proc *p, struct process *qr, int signum) { struct process *pr = p->p_p; struct ucred *uc = p->p_ucred; struct ucred *quc = qr->ps_ucred; if (uc->cr_uid == 0) return (1); /* root can always signal */ if (pr == qr) return (1); /* process can always signal itself */ /* optimization: if the same creds then the tests below will pass */ if (uc == quc) return (1); if (signum == SIGCONT && qr->ps_session == pr->ps_session) return (1); /* SIGCONT in session */ /* * Using kill(), only certain signals can be sent to setugid * child processes */ if (qr->ps_flags & PS_SUGID) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: if (uc->cr_ruid == quc->cr_ruid || uc->cr_uid == quc->cr_ruid) return (1); } return (0); } if (uc->cr_ruid == quc->cr_ruid || uc->cr_ruid == quc->cr_svuid || uc->cr_uid == quc->cr_ruid || uc->cr_uid == quc->cr_svuid) return (1); return (0); } /* * Initialize signal-related data structures. */ void signal_init(void) { proc_stop_si = softintr_establish(IPL_SOFTCLOCK, proc_stop_sweep, NULL); if (proc_stop_si == NULL) panic("signal_init failed to register softintr"); pool_init(&sigacts_pool, sizeof(struct sigacts), 0, IPL_NONE, PR_WAITOK, "sigapl", NULL); } /* * Initialize a new sigaltstack structure. */ void sigstkinit(struct sigaltstack *ss) { ss->ss_flags = SS_DISABLE; ss->ss_size = 0; ss->ss_sp = NULL; } /* * Create an initial sigacts structure, using the same signal state * as pr. */ struct sigacts * sigactsinit(struct process *pr) { struct sigacts *ps; ps = pool_get(&sigacts_pool, PR_WAITOK); memcpy(ps, pr->ps_sigacts, sizeof(struct sigacts)); return (ps); } /* * Release a sigacts structure. 
*/ void sigactsfree(struct sigacts *ps) { pool_put(&sigacts_pool, ps); } int sys_sigaction(struct proc *p, void *v, register_t *retval) { struct sys_sigaction_args /* { syscallarg(int) signum; syscallarg(const struct sigaction *) nsa; syscallarg(struct sigaction *) osa; } */ *uap = v; struct sigaction vec; #ifdef KTRACE struct sigaction ovec; #endif struct sigaction *sa; const struct sigaction *nsa; struct sigaction *osa; struct sigacts *ps = p->p_p->ps_sigacts; int signum; int bit, error; signum = SCARG(uap, signum); nsa = SCARG(uap, nsa); osa = SCARG(uap, osa); if (signum <= 0 || signum >= NSIG || (nsa && (signum == SIGKILL || signum == SIGSTOP))) return (EINVAL); sa = &vec; if (osa) { mtx_enter(&p->p_p->ps_mtx); sa->sa_handler = ps->ps_sigact[signum]; sa->sa_mask = ps->ps_catchmask[signum]; bit = sigmask(signum); sa->sa_flags = 0; if ((ps->ps_sigonstack & bit) != 0) sa->sa_flags |= SA_ONSTACK; if ((ps->ps_sigintr & bit) == 0) sa->sa_flags |= SA_RESTART; if ((ps->ps_sigreset & bit) != 0) sa->sa_flags |= SA_RESETHAND; if ((ps->ps_siginfo & bit) != 0) sa->sa_flags |= SA_SIGINFO; if (signum == SIGCHLD) { if ((ps->ps_sigflags & SAS_NOCLDSTOP) != 0) sa->sa_flags |= SA_NOCLDSTOP; if ((ps->ps_sigflags & SAS_NOCLDWAIT) != 0) sa->sa_flags |= SA_NOCLDWAIT; } mtx_leave(&p->p_p->ps_mtx); if ((sa->sa_mask & bit) == 0) sa->sa_flags |= SA_NODEFER; sa->sa_mask &= ~bit; error = copyout(sa, osa, sizeof (vec)); if (error) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ovec = vec; #endif } if (nsa) { error = copyin(nsa, sa, sizeof (vec)); if (error) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrsigaction(p, sa); #endif setsigvec(p, signum, sa); } #ifdef KTRACE if (osa && KTRPOINT(p, KTR_STRUCT)) ktrsigaction(p, &ovec); #endif return (0); } void setsigvec(struct proc *p, int signum, struct sigaction *sa) { struct sigacts *ps = p->p_p->ps_sigacts; int bit; bit = sigmask(signum); mtx_enter(&p->p_p->ps_mtx); ps->ps_sigact[signum] = sa->sa_handler; if ((sa->sa_flags & SA_NODEFER) == 0) sa->sa_mask |= sigmask(signum); ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; if (signum == SIGCHLD) { if (sa->sa_flags & SA_NOCLDSTOP) atomic_setbits_int(&ps->ps_sigflags, SAS_NOCLDSTOP); else atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDSTOP); /* * If the SA_NOCLDWAIT flag is set or the handler * is SIG_IGN we reparent the dying child to PID 1 * (init) which will reap the zombie. Because we use * init to do our dirty work we never set SAS_NOCLDWAIT * for PID 1. * XXX exit1 rework means this is unnecessary? */ if (initprocess->ps_sigacts != ps && ((sa->sa_flags & SA_NOCLDWAIT) || sa->sa_handler == SIG_IGN)) atomic_setbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT); else atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT); } if ((sa->sa_flags & SA_RESETHAND) != 0) ps->ps_sigreset |= bit; else ps->ps_sigreset &= ~bit; if ((sa->sa_flags & SA_SIGINFO) != 0) ps->ps_siginfo |= bit; else ps->ps_siginfo &= ~bit; if ((sa->sa_flags & SA_RESTART) == 0) ps->ps_sigintr |= bit; else ps->ps_sigintr &= ~bit; if ((sa->sa_flags & SA_ONSTACK) != 0) ps->ps_sigonstack |= bit; else ps->ps_sigonstack &= ~bit; /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to ignore. * However, don't put SIGCONT in ps_sigignore, * as we have to restart the process. 
*/ if (sa->sa_handler == SIG_IGN || (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) { atomic_clearbits_int(&p->p_siglist, bit); atomic_clearbits_int(&p->p_p->ps_siglist, bit); if (signum != SIGCONT) ps->ps_sigignore |= bit; /* easier in psignal */ ps->ps_sigcatch &= ~bit; } else { ps->ps_sigignore &= ~bit; if (sa->sa_handler == SIG_DFL) ps->ps_sigcatch &= ~bit; else ps->ps_sigcatch |= bit; } mtx_leave(&p->p_p->ps_mtx); } /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct sigacts *ps) { int i; for (i = 0; i < NSIG; i++) if (sigprop[i] & SA_IGNORE && i != SIGCONT) ps->ps_sigignore |= sigmask(i); ps->ps_sigflags = SAS_NOCLDWAIT | SAS_NOCLDSTOP; } /* * Reset signals for an exec by the specified thread. */ void execsigs(struct proc *p) { struct sigacts *ps; int nc, mask; ps = p->p_p->ps_sigacts; mtx_enter(&p->p_p->ps_mtx); /* * Reset caught signals. Held signals remain held * through p_sigmask (unless they were caught, * and are now ignored by default). */ while (ps->ps_sigcatch) { nc = ffs((long)ps->ps_sigcatch); mask = sigmask(nc); ps->ps_sigcatch &= ~mask; if (sigprop[nc] & SA_IGNORE) { if (nc != SIGCONT) ps->ps_sigignore |= mask; atomic_clearbits_int(&p->p_siglist, mask); atomic_clearbits_int(&p->p_p->ps_siglist, mask); } ps->ps_sigact[nc] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ sigstkinit(&p->p_sigstk); atomic_clearbits_int(&ps->ps_sigflags, SAS_NOCLDWAIT); if (ps->ps_sigact[SIGCHLD] == SIG_IGN) ps->ps_sigact[SIGCHLD] = SIG_DFL; mtx_leave(&p->p_p->ps_mtx); } /* * Manipulate signal mask. * Note that we receive new mask, not pointer, * and return old mask as return value; * the library stub does the rest. */ int sys_sigprocmask(struct proc *p, void *v, register_t *retval) { struct sys_sigprocmask_args /* { syscallarg(int) how; syscallarg(sigset_t) mask; } */ *uap = v; int error = 0; sigset_t mask; KASSERT(p == curproc); *retval = p->p_sigmask; mask = SCARG(uap, mask) &~ sigcantmask; switch (SCARG(uap, how)) { case SIG_BLOCK: atomic_setbits_int(&p->p_sigmask, mask); break; case SIG_UNBLOCK: atomic_clearbits_int(&p->p_sigmask, mask); break; case SIG_SETMASK: p->p_sigmask = mask; break; default: error = EINVAL; break; } return (error); } int sys_sigpending(struct proc *p, void *v, register_t *retval) { *retval = p->p_siglist | p->p_p->ps_siglist; return (0); } /* * Temporarily replace calling proc's signal mask for the duration of a * system call. Original signal mask will be restored by userret(). */ void dosigsuspend(struct proc *p, sigset_t newmask) { KASSERT(p == curproc); p->p_oldmask = p->p_sigmask; atomic_setbits_int(&p->p_flag, P_SIGSUSPEND); p->p_sigmask = newmask; } /* * Suspend thread until signal, providing mask to be set * in the meantime. Note nonstandard calling convention: * libc stub passes mask, not pointer, to save a copyin. */ int sys_sigsuspend(struct proc *p, void *v, register_t *retval) { struct sys_sigsuspend_args /* { syscallarg(int) mask; } */ *uap = v; dosigsuspend(p, SCARG(uap, mask) &~ sigcantmask); while (tsleep_nsec(&nowake, PPAUSE|PCATCH, "sigsusp", INFSLP) == 0) continue; /* always return EINTR rather than ERESTART... */ return (EINTR); } int sigonstack(size_t stack) { const struct sigaltstack *ss = &curproc->p_sigstk; return (ss->ss_flags & SS_DISABLE ? 
0 : (stack - (size_t)ss->ss_sp < ss->ss_size)); } int sys_sigaltstack(struct proc *p, void *v, register_t *retval) { struct sys_sigaltstack_args /* { syscallarg(const struct sigaltstack *) nss; syscallarg(struct sigaltstack *) oss; } */ *uap = v; struct sigaltstack ss; const struct sigaltstack *nss; struct sigaltstack *oss; int onstack = sigonstack(PROC_STACK(p)); int error; nss = SCARG(uap, nss); oss = SCARG(uap, oss); if (oss != NULL) { ss = p->p_sigstk; if (onstack) ss.ss_flags |= SS_ONSTACK; if ((error = copyout(&ss, oss, sizeof(ss)))) return (error); } if (nss == NULL) return (0); error = copyin(nss, &ss, sizeof(ss)); if (error) return (error); if (onstack) return (EPERM); if (ss.ss_flags & ~SS_DISABLE) return (EINVAL); if (ss.ss_flags & SS_DISABLE) { p->p_sigstk.ss_flags = ss.ss_flags; return (0); } if (ss.ss_size < MINSIGSTKSZ) return (ENOMEM); error = uvm_map_remap_as_stack(p, (vaddr_t)ss.ss_sp, ss.ss_size); if (error) return (error); p->p_sigstk = ss; return (0); } int sys_kill(struct proc *cp, void *v, register_t *retval) { struct sys_kill_args /* { syscallarg(int) pid; syscallarg(int) signum; } */ *uap = v; struct process *pr; int pid = SCARG(uap, pid); int signum = SCARG(uap, signum); int error; int zombie = 0; if ((error = pledge_kill(cp, pid)) != 0) return (error); if (((u_int)signum) >= NSIG) return (EINVAL); if (pid > 0) { if ((pr = prfind(pid)) == NULL) { if ((pr = zombiefind(pid)) == NULL) return (ESRCH); else zombie = 1; } if (!cansignal(cp, pr, signum)) return (EPERM); /* kill single process */ if (signum && !zombie) prsignal(pr, signum); return (0); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(cp, signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(cp, signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(cp, signum, -pid, 0)); } } int sys_thrkill(struct proc *cp, void *v, register_t *retval) { struct sys_thrkill_args /* { syscallarg(pid_t) tid; syscallarg(int) signum; syscallarg(void *) tcb; } */ *uap = v; struct proc *p; int tid = SCARG(uap, tid); int signum = SCARG(uap, signum); void *tcb; if (((u_int)signum) >= NSIG) return (EINVAL); if (tid > THREAD_PID_OFFSET) { if ((p = tfind(tid - THREAD_PID_OFFSET)) == NULL) return (ESRCH); /* can only kill threads in the same process */ if (p->p_p != cp->p_p) return (ESRCH); } else if (tid == 0) p = cp; else return (EINVAL); /* optionally require the target thread to have the given tcb addr */ tcb = SCARG(uap, tcb); if (tcb != NULL && tcb != TCB_GET(p)) return (ESRCH); if (signum) ptsignal(p, signum, STHREAD); return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ int killpg1(struct proc *cp, int signum, int pgid, int all) { struct process *pr; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ LIST_FOREACH(pr, &allprocess, ps_list) { if (pr->ps_pid <= 1 || pr->ps_flags & (PS_SYSTEM | PS_NOBROADCASTKILL) || pr == cp->p_p || !cansignal(cp, pr, signum)) continue; nfound++; if (signum) prsignal(pr, signum); } } else { if (pgid == 0) /* * zero pgid means send to my process group. */ pgrp = cp->p_p->ps_pgrp; else { pgrp = pgfind(pgid); if (pgrp == NULL) return (ESRCH); } LIST_FOREACH(pr, &pgrp->pg_members, ps_pglist) { if (pr->ps_pid <= 1 || pr->ps_flags & PS_SYSTEM || !cansignal(cp, pr, signum)) continue; nfound++; if (signum) prsignal(pr, signum); } } return (nfound ? 
0 : ESRCH); } #define CANDELIVER(uid, euid, pr) \ (euid == 0 || \ (uid) == (pr)->ps_ucred->cr_ruid || \ (uid) == (pr)->ps_ucred->cr_svuid || \ (uid) == (pr)->ps_ucred->cr_uid || \ (euid) == (pr)->ps_ucred->cr_ruid || \ (euid) == (pr)->ps_ucred->cr_svuid || \ (euid) == (pr)->ps_ucred->cr_uid) #define CANSIGIO(cr, pr) \ CANDELIVER((cr)->cr_ruid, (cr)->cr_uid, (pr)) /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int signum, int checkctty) { struct process *pr; if (pgrp) LIST_FOREACH(pr, &pgrp->pg_members, ps_pglist) if (checkctty == 0 || pr->ps_flags & PS_CONTROLT) prsignal(pr, signum); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio_ref *sir, int sig, int checkctty) { struct process *pr; struct sigio *sigio; if (sir->sir_sigio == NULL) return; KERNEL_LOCK(); mtx_enter(&sigio_lock); sigio = sir->sir_sigio; if (sigio == NULL) goto out; if (sigio->sio_pgid > 0) { if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc)) prsignal(sigio->sio_proc, sig); } else if (sigio->sio_pgid < 0) { LIST_FOREACH(pr, &sigio->sio_pgrp->pg_members, ps_pglist) { if (CANSIGIO(sigio->sio_ucred, pr) && (checkctty == 0 || (pr->ps_flags & PS_CONTROLT))) prsignal(pr, sig); } } out: mtx_leave(&sigio_lock); KERNEL_UNLOCK(); } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. */ void postsig_done(struct proc *p, int signum, sigset_t catchmask, int reset) { p->p_ru.ru_nsignals++; atomic_setbits_int(&p->p_sigmask, catchmask); if (reset != 0) { sigset_t mask = sigmask(signum); struct sigacts *ps = p->p_p->ps_sigacts; mtx_enter(&p->p_p->ps_mtx); ps->ps_sigcatch &= ~mask; if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) ps->ps_sigignore |= mask; ps->ps_sigact[signum] = SIG_DFL; mtx_leave(&p->p_p->ps_mtx); } } /* * Send a signal caused by a trap to the current thread * If it will be caught immediately, deliver it with correct code. * Otherwise, post it normally. */ void trapsignal(struct proc *p, int signum, u_long trapno, int code, union sigval sigval) { struct process *pr = p->p_p; struct sigctx ctx; int mask; switch (signum) { case SIGILL: case SIGBUS: case SIGSEGV: pr->ps_acflag |= ATRAP; break; } mask = sigmask(signum); setsigctx(p, signum, &ctx); if ((pr->ps_flags & PS_TRACED) == 0 && ctx.sig_catch != 0 && (p->p_sigmask & mask) == 0) { siginfo_t si; initsiginfo(&si, signum, trapno, code, sigval); #ifdef KTRACE if (KTRPOINT(p, KTR_PSIG)) { ktrpsig(p, signum, ctx.sig_action, p->p_sigmask, code, &si); } #endif if (sendsig(ctx.sig_action, signum, p->p_sigmask, &si, ctx.sig_info, ctx.sig_onstack)) { KERNEL_LOCK(); sigexit(p, SIGILL); /* NOTREACHED */ } postsig_done(p, signum, ctx.sig_catchmask, ctx.sig_reset); } else { p->p_sisig = signum; p->p_sitrapno = trapno; /* XXX for core dump/debugger */ p->p_sicode = code; p->p_sigval = sigval; /* * If traced, stop if signal is masked, and stay stopped * until released by the debugger. If our parent process * is waiting for us, don't hang as we could deadlock. 
*/ if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) && signum != SIGKILL && (p->p_sigmask & mask) != 0) { int s; single_thread_set(p, SINGLE_SUSPEND, 0); pr->ps_xsig = signum; SCHED_LOCK(s); proc_stop(p, 1); SCHED_UNLOCK(s); signum = pr->ps_xsig; single_thread_clear(p, 0); /* * If we are no longer being traced, or the parent * didn't give us a signal, skip sending the signal. */ if ((pr->ps_flags & PS_TRACED) == 0 || signum == 0) return; /* update signal info */ p->p_sisig = signum; mask = sigmask(signum); } /* * Signals like SIGBUS and SIGSEGV should not, when * generated by the kernel, be ignorable or blockable. * If it is and we're not being traced, then just kill * the process. * After vfs_shutdown(9), init(8) cannot receive signals * because new code pages of the signal handler cannot be * mapped from halted storage. init(8) may not die or the * kernel panics. Better loop between signal handler and * page fault trap until the machine is halted. */ if ((pr->ps_flags & PS_TRACED) == 0 && (sigprop[signum] & SA_KILL) && ((p->p_sigmask & mask) || ctx.sig_ignore) && pr->ps_pid != 1) { KERNEL_LOCK(); sigexit(p, signum); /* NOTREACHED */ } KERNEL_LOCK(); ptsignal(p, signum, STHREAD); KERNEL_UNLOCK(); } } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. */ void psignal(struct proc *p, int signum) { ptsignal(p, signum, SPROCESS); } /* * type = SPROCESS process signal, can be diverted (sigwait()) * type = STHREAD thread signal, but should be propagated if unhandled * type = SPROPAGATED propagated to this thread, so don't propagate again */ void ptsignal(struct proc *p, int signum, enum signal_type type) { int s, prop; sig_t action; int mask; int *siglist; struct process *pr = p->p_p; struct proc *q; int wakeparent = 0; KERNEL_ASSERT_LOCKED(); #ifdef DIAGNOSTIC if ((u_int)signum >= NSIG || signum == 0) panic("psignal signal number"); #endif /* Ignore signal if the target process is exiting */ if (pr->ps_flags & PS_EXITING) return; mask = sigmask(signum); if (type == SPROCESS) { /* Accept SIGKILL to coredumping processes */ if (pr->ps_flags & PS_COREDUMP && signum == SIGKILL) { atomic_setbits_int(&pr->ps_siglist, mask); return; } /* * If the current thread can process the signal * immediately (it's unblocked) then have it take it. */ q = curproc; if (q != NULL && q->p_p == pr && (q->p_flag & P_WEXIT) == 0 && (q->p_sigmask & mask) == 0) p = q; else { /* * A process-wide signal can be diverted to a * different thread that's in sigwait() for this * signal. If there isn't such a thread, then * pick a thread that doesn't have it blocked so * that the stop/kill consideration isn't * delayed. Otherwise, mark it pending on the * main thread. */ TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) { /* ignore exiting threads */ if (q->p_flag & P_WEXIT) continue; /* skip threads that have the signal blocked */ if ((q->p_sigmask & mask) != 0) continue; /* okay, could send to this thread */ p = q; /* * sigsuspend, sigwait, ppoll/pselect, etc? * Definitely go to this thread, as it's * already blocked in the kernel. 
*/ if (q->p_flag & P_SIGSUSPEND) break; } } } if (type != SPROPAGATED) KNOTE(&pr->ps_klist, NOTE_SIGNAL | signum); prop = sigprop[signum]; /* * If proc is traced, always give parent a chance. */ if (pr->ps_flags & PS_TRACED) { action = SIG_DFL; } else { sigset_t sigcatch, sigignore; /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_enter(&pr->ps_mtx); sigignore = pr->ps_sigacts->ps_sigignore; sigcatch = pr->ps_sigacts->ps_sigcatch; mtx_leave(&pr->ps_mtx); if (sigignore & mask) return; if (p->p_sigmask & mask) { action = SIG_HOLD; } else if (sigcatch & mask) { action = SIG_CATCH; } else { action = SIG_DFL; if (prop & SA_KILL && pr->ps_nice > NZERO) pr->ps_nice = NZERO; /* * If sending a tty stop signal to a member of an * orphaned process group, discard the signal here if * the action is default; don't stop the process below * if sleeping, and don't clear any pending SIGCONT. */ if (prop & SA_TTYSTOP && pr->ps_pgrp->pg_jobc == 0) return; } } /* * If delivered to process, mark as pending there. Continue and stop * signals will be propagated to all threads. So they are always * marked at thread level. */ siglist = (type == SPROCESS) ? &pr->ps_siglist : &p->p_siglist; if (prop & SA_CONT) { siglist = &p->p_siglist; atomic_clearbits_int(siglist, STOPSIGMASK); } if (prop & SA_STOP) { siglist = &p->p_siglist; atomic_clearbits_int(siglist, CONTSIGMASK); atomic_clearbits_int(&p->p_flag, P_CONTINUED); } /* * XXX delay processing of SA_STOP signals unless action == SIG_DFL? */ if (prop & (SA_CONT | SA_STOP) && type != SPROPAGATED) TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) if (q != p) ptsignal(q, signum, SPROPAGATED); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) { atomic_setbits_int(siglist, mask); return; } SCHED_LOCK(s); switch (p->p_stat) { case SSLEEP: /* * If process is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((p->p_flag & P_SINTR) == 0) goto out; /* * Process is sleeping and traced... make it runnable * so it can discover the signal in cursig() and stop * for the parent. */ if (pr->ps_flags & PS_TRACED) goto run; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { mask = 0; goto out; } /* * When a sleeping process receives a stop * signal, process immediately if possible. */ if ((prop & SA_STOP) && action == SIG_DFL) { /* * If a child holding parent blocked, * stopping could cause deadlock. */ if (pr->ps_flags & PS_PPWAIT) goto out; mask = 0; pr->ps_xsig = signum; proc_stop(p, 0); goto out; } /* * All other (caught or default) signals * cause the process to run. */ goto runfast; /* NOTREACHED */ case SSTOP: /* * If traced process is already stopped, * then no further action is necessary. */ if (pr->ps_flags & PS_TRACED) goto out; /* * Kill signal always sets processes running. */ if (signum == SIGKILL) { atomic_clearbits_int(&p->p_flag, P_SUSPSIG); goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in p_siglist, as * it has no further action. 
If SIGCONT is held, we * continue the process and leave the signal in * p_siglist. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, then it goes back to run state. * Otherwise, process goes back to sleep state. */ atomic_setbits_int(&p->p_flag, P_CONTINUED); atomic_clearbits_int(&p->p_flag, P_SUSPSIG); wakeparent = 1; if (action == SIG_DFL) atomic_clearbits_int(siglist, mask); if (action == SIG_CATCH) goto runfast; if (p->p_wchan == NULL) goto run; p->p_stat = SSLEEP; goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again. * (If we did the shell could get confused.) */ mask = 0; goto out; } /* * If process is sleeping interruptibly, then simulate a * wakeup so that when it is continued, it will be made * runnable and can look at the signal. But don't make * the process runnable, leave it stopped. */ if (p->p_flag & P_SINTR) unsleep(p); goto out; case SONPROC: /* set siglist before issuing the ast */ atomic_setbits_int(siglist, mask); mask = 0; signotify(p); /* FALLTHROUGH */ default: /* * SRUN, SIDL, SDEAD do nothing with the signal, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ goto out; } /* NOTREACHED */ runfast: /* * Raise priority to at least PUSER. */ if (p->p_usrpri > PUSER) p->p_usrpri = PUSER; run: setrunnable(p); out: /* finally adjust siglist */ if (mask) atomic_setbits_int(siglist, mask); SCHED_UNLOCK(s); if (wakeparent) wakeup(pr->ps_pptr); } /* fill the signal context which should be used by postsig() and issignal() */ void setsigctx(struct proc *p, int signum, struct sigctx *sctx) { struct sigacts *ps = p->p_p->ps_sigacts; sigset_t mask; mtx_enter(&p->p_p->ps_mtx); mask = sigmask(signum); sctx->sig_action = ps->ps_sigact[signum]; sctx->sig_catchmask = ps->ps_catchmask[signum]; sctx->sig_reset = (ps->ps_sigreset & mask) != 0; sctx->sig_info = (ps->ps_siginfo & mask) != 0; sctx->sig_intr = (ps->ps_sigintr & mask) != 0; sctx->sig_onstack = (ps->ps_sigonstack & mask) != 0; sctx->sig_ignore = (ps->ps_sigignore & mask) != 0; sctx->sig_catch = (ps->ps_sigcatch & mask) != 0; mtx_leave(&p->p_p->ps_mtx); } /* * Determine signal that should be delivered to process p, the current * process, 0 if none. * * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap. The normal call sequence is * * while (signum = cursig(curproc, &ctx)) * postsig(signum, &ctx); * * Assumes that if the P_SINTR flag is set, we're holding both the * kernel and scheduler locks. */ int cursig(struct proc *p, struct sigctx *sctx) { struct process *pr = p->p_p; int signum, mask, prop; int dolock = (p->p_flag & P_SINTR) == 0; sigset_t ps_siglist; int s; KASSERT(p == curproc); for (;;) { ps_siglist = READ_ONCE(pr->ps_siglist); membar_consumer(); mask = SIGPENDING(p); if (pr->ps_flags & PS_PPWAIT) mask &= ~STOPSIGMASK; if (mask == 0) /* no signal to send */ return (0); signum = ffs((long)mask); mask = sigmask(signum); /* take the signal! 
*/ if (atomic_cas_uint(&pr->ps_siglist, ps_siglist, ps_siglist & ~mask) != ps_siglist) { /* lost race taking the process signal, restart */ continue; } atomic_clearbits_int(&p->p_siglist, mask); setsigctx(p, signum, sctx); /* * We should see pending but ignored signals * only if PS_TRACED was on when they were posted. */ if (sctx->sig_ignore && (pr->ps_flags & PS_TRACED) == 0) continue; /* * If traced, always stop, and stay stopped until released * by the debugger. If our parent process is waiting for * us, don't hang as we could deadlock. */ if (((pr->ps_flags & (PS_TRACED | PS_PPWAIT)) == PS_TRACED) && signum != SIGKILL) { single_thread_set(p, SINGLE_SUSPEND, 0); pr->ps_xsig = signum; if (dolock) SCHED_LOCK(s); proc_stop(p, 1); if (dolock) SCHED_UNLOCK(s); /* * re-take the signal before releasing * the other threads. Must check the continue * conditions below and only take the signal if * those are not true. */ signum = pr->ps_xsig; mask = sigmask(signum); setsigctx(p, signum, sctx); if (!((pr->ps_flags & PS_TRACED) == 0 || signum == 0 || (p->p_sigmask & mask) != 0)) { atomic_clearbits_int(&p->p_siglist, mask); atomic_clearbits_int(&pr->ps_siglist, mask); } single_thread_clear(p, 0); /* * If we are no longer being traced, or the parent * didn't give us a signal, look for more signals. */ if ((pr->ps_flags & PS_TRACED) == 0 || signum == 0) continue; /* * If the new signal is being masked, look for other * signals. */ if ((p->p_sigmask & mask) != 0) continue; } prop = sigprop[signum]; /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((long)sctx->sig_action) { case (long)SIG_DFL: /* * Don't take default actions on system processes. */ if (pr->ps_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %d) got signal" " %d\n", pr->ps_pid, signum); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (pr->ps_flags & PS_TRACED || (pr->ps_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ pr->ps_xsig = signum; if (dolock) SCHED_LOCK(s); proc_stop(p, 1); if (dolock) SCHED_UNLOCK(s); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else goto keep; /* NOTREACHED */ case (long)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (pr->ps_flags & PS_TRACED) == 0) printf("%s\n", __func__); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ goto keep; } } /* NOTREACHED */ keep: atomic_setbits_int(&p->p_siglist, mask); /*leave the signal for later */ return (signum); } /* * Put the argument process into the stopped state and notify the parent * via wakeup. Signals are handled elsewhere. The process must not be * on the run queue. */ void proc_stop(struct proc *p, int sw) { struct process *pr = p->p_p; #ifdef MULTIPROCESSOR SCHED_ASSERT_LOCKED(); #endif p->p_stat = SSTOP; atomic_clearbits_int(&pr->ps_flags, PS_WAITED); atomic_setbits_int(&pr->ps_flags, PS_STOPPED); atomic_setbits_int(&p->p_flag, P_SUSPSIG); /* * We need this soft interrupt to be handled fast. 
* Extra calls to softclock don't hurt. */ softintr_schedule(proc_stop_si); if (sw) mi_switch(); } /* * Called from a soft interrupt to send signals to the parents of stopped * processes. * We can't do this in proc_stop because it's called with nasty locks held * and we would need recursive scheduler lock to deal with that. */ void proc_stop_sweep(void *v) { struct process *pr; LIST_FOREACH(pr, &allprocess, ps_list) { if ((pr->ps_flags & PS_STOPPED) == 0) continue; atomic_clearbits_int(&pr->ps_flags, PS_STOPPED); if ((pr->ps_pptr->ps_sigacts->ps_sigflags & SAS_NOCLDSTOP) == 0) prsignal(pr->ps_pptr, SIGCHLD); wakeup(pr->ps_pptr); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(struct proc *p, int signum, struct sigctx *sctx) { u_long trapno; int mask, returnmask; siginfo_t si; union sigval sigval; int code; KASSERT(signum != 0); mask = sigmask(signum); atomic_clearbits_int(&p->p_siglist, mask); sigval.sival_ptr = NULL; if (p->p_sisig != signum) { trapno = 0; code = SI_USER; sigval.sival_ptr = NULL; } else { trapno = p->p_sitrapno; code = p->p_sicode; sigval = p->p_sigval; } initsiginfo(&si, signum, trapno, code, sigval); #ifdef KTRACE if (KTRPOINT(p, KTR_PSIG)) { ktrpsig(p, signum, sctx->sig_action, p->p_flag & P_SIGSUSPEND ? p->p_oldmask : p->p_sigmask, code, &si); } #endif if (sctx->sig_action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ KERNEL_LOCK(); sigexit(p, signum); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ #ifdef DIAGNOSTIC if (sctx->sig_action == SIG_IGN || (p->p_sigmask & mask)) panic("postsig action"); #endif /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigpause. Here the * current mask is not of interest, but rather the * mask from before the sigpause is what we want * restored after the signal processing is completed. */ if (p->p_flag & P_SIGSUSPEND) { atomic_clearbits_int(&p->p_flag, P_SIGSUSPEND); returnmask = p->p_oldmask; } else { returnmask = p->p_sigmask; } if (p->p_sisig == signum) { p->p_sisig = 0; p->p_sitrapno = 0; p->p_sicode = SI_USER; p->p_sigval.sival_ptr = NULL; } if (sendsig(sctx->sig_action, signum, returnmask, &si, sctx->sig_info, sctx->sig_onstack)) { KERNEL_LOCK(); sigexit(p, SIGILL); /* NOTREACHED */ } postsig_done(p, signum, sctx->sig_catchmask, sctx->sig_reset); } } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct proc *p, int signum) { /* Mark process as going away */ atomic_setbits_int(&p->p_flag, P_WEXIT); p->p_p->ps_acflag |= AXSIG; if (sigprop[signum] & SA_CORE) { p->p_sisig = signum; /* if there are other threads, pause them */ if (P_HASSIBLING(p)) single_thread_set(p, SINGLE_SUSPEND, 1); if (coredump(p) == 0) signum |= WCOREFLAG; } exit1(p, 0, signum, EXIT_NORMAL); /* NOTREACHED */ } /* * Send uncatchable SIGABRT for coredump. 
*/ void sigabort(struct proc *p) { struct sigaction sa; memset(&sa, 0, sizeof sa); sa.sa_handler = SIG_DFL; setsigvec(p, SIGABRT, &sa); atomic_clearbits_int(&p->p_sigmask, sigmask(SIGABRT)); psignal(p, SIGABRT); } /* * Return 1 if `sig', a given signal, is ignored or masked for `p', a given * thread, and 0 otherwise. */ int sigismasked(struct proc *p, int sig) { struct process *pr = p->p_p; int rv; mtx_enter(&pr->ps_mtx); rv = (pr->ps_sigacts->ps_sigignore & sigmask(sig)) || (p->p_sigmask & sigmask(sig)); mtx_leave(&pr->ps_mtx); return !!rv; } struct coredump_iostate { struct proc *io_proc; struct vnode *io_vp; struct ucred *io_cred; off_t io_offset; }; /* * Dump core, into a file named "progname.core", unless the process was * setuid/setgid. */ int coredump(struct proc *p) { #ifdef SMALL_KERNEL return EPERM; #else struct process *pr = p->p_p; struct vnode *vp; struct ucred *cred = p->p_ucred; struct vmspace *vm = p->p_vmspace; struct nameidata nd; struct vattr vattr; struct coredump_iostate io; int error, len, incrash = 0; char *name; const char *dir = "/var/crash"; atomic_setbits_int(&pr->ps_flags, PS_COREDUMP); /* Don't dump if will exceed file size limit. */ if (USPACE + ptoa(vm->vm_dsize + vm->vm_ssize) >= lim_cur(RLIMIT_CORE)) return (EFBIG); name = pool_get(&namei_pool, PR_WAITOK); /* * If the process has inconsistent uids, nosuidcoredump * determines coredump placement policy. */ if (((pr->ps_flags & PS_SUGID) && (error = suser(p))) || ((pr->ps_flags & PS_SUGID) && nosuidcoredump)) { if (nosuidcoredump == 3) { /* * If the program directory does not exist, dumps of * that core will silently fail. */ len = snprintf(name, MAXPATHLEN, "%s/%s/%u.core", dir, pr->ps_comm, pr->ps_pid); incrash = KERNELPATH; } else if (nosuidcoredump == 2) { len = snprintf(name, MAXPATHLEN, "%s/%s.core", dir, pr->ps_comm); incrash = KERNELPATH; } else { pool_put(&namei_pool, name); return (EPERM); } } else len = snprintf(name, MAXPATHLEN, "%s.core", pr->ps_comm); if (len >= MAXPATHLEN) { pool_put(&namei_pool, name); return (EACCES); } /* * Control the UID used to write out. The normal case uses * the real UID. If the sugid case is going to write into the * controlled directory, we do so as root. */ if (incrash == 0) { cred = crdup(cred); cred->cr_uid = cred->cr_ruid; cred->cr_gid = cred->cr_rgid; } else { if (p->p_fd->fd_rdir) { vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = NULL; } p->p_ucred = crdup(p->p_ucred); crfree(cred); cred = p->p_ucred; crhold(cred); cred->cr_uid = 0; cred->cr_gid = 0; } /* incrash should be 0 or KERNELPATH only */ NDINIT(&nd, 0, incrash, UIO_SYSSPACE, name, p); error = vn_open(&nd, O_CREAT | FWRITE | O_NOFOLLOW | O_NONBLOCK, S_IRUSR | S_IWUSR); if (error) goto out; /* * Don't dump to non-regular files, files with links, or files * owned by someone else. 
*/ vp = nd.ni_vp; if ((error = VOP_GETATTR(vp, &vattr, cred, p)) != 0) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, cred, p); goto out; } if (vp->v_type != VREG || vattr.va_nlink != 1 || vattr.va_mode & ((VREAD | VWRITE) >> 3 | (VREAD | VWRITE) >> 6) || vattr.va_uid != cred->cr_uid) { error = EACCES; VOP_UNLOCK(vp); vn_close(vp, FWRITE, cred, p); goto out; } VATTR_NULL(&vattr); vattr.va_size = 0; VOP_SETATTR(vp, &vattr, cred, p); pr->ps_acflag |= ACORE; io.io_proc = p; io.io_vp = vp; io.io_cred = cred; io.io_offset = 0; VOP_UNLOCK(vp); vref(vp); error = vn_close(vp, FWRITE, cred, p); if (error == 0) error = coredump_elf(p, &io); vrele(vp); out: crfree(cred); pool_put(&namei_pool, name); return (error); #endif } #ifndef SMALL_KERNEL int coredump_write(void *cookie, enum uio_seg segflg, const void *data, size_t len) { struct coredump_iostate *io = cookie; off_t coffset = 0; size_t csize; int chunk, error; csize = len; do { if (sigmask(SIGKILL) & (io->io_proc->p_siglist | io->io_proc->p_p->ps_siglist)) return (EINTR); /* Rest of the loop sleeps with lock held, so... */ yield(); chunk = MIN(csize, MAXPHYS); error = vn_rdwr(UIO_WRITE, io->io_vp, (caddr_t)data + coffset, chunk, io->io_offset + coffset, segflg, IO_UNIT, io->io_cred, NULL, io->io_proc); if (error) { struct process *pr = io->io_proc->p_p; if (error == ENOSPC) log(LOG_ERR, "coredump of %s(%d) failed, filesystem full\n", pr->ps_comm, pr->ps_pid); else log(LOG_ERR, "coredump of %s(%d), write failed: errno %d\n", pr->ps_comm, pr->ps_pid, error); return (error); } coffset += chunk; csize -= chunk; } while (csize > 0); io->io_offset += len; return (0); } void coredump_unmap(void *cookie, vaddr_t start, vaddr_t end) { struct coredump_iostate *io = cookie; uvm_unmap(&io->io_proc->p_vmspace->vm_map, start, end); } #endif /* !SMALL_KERNEL */ /* * Nonexistent system call-- signal process (may want to handle it). * Flag error in case process won't see signal immediately (blocked or ignored). 
*/ int sys_nosys(struct proc *p, void *v, register_t *retval) { ptsignal(p, SIGSYS, STHREAD); return (ENOSYS); } int sys___thrsigdivert(struct proc *p, void *v, register_t *retval) { static int sigwaitsleep; struct sys___thrsigdivert_args /* { syscallarg(sigset_t) sigmask; syscallarg(siginfo_t *) info; syscallarg(const struct timespec *) timeout; } */ *uap = v; struct sigctx ctx; sigset_t mask = SCARG(uap, sigmask) &~ sigcantmask; siginfo_t si; uint64_t nsecs = INFSLP; int timeinvalid = 0; int error = 0; memset(&si, 0, sizeof(si)); if (SCARG(uap, timeout) != NULL) { struct timespec ts; if ((error = copyin(SCARG(uap, timeout), &ts, sizeof(ts))) != 0) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrreltimespec(p, &ts); #endif if (!timespecisvalid(&ts)) timeinvalid = 1; else nsecs = TIMESPEC_TO_NSEC(&ts); } dosigsuspend(p, p->p_sigmask &~ mask); for (;;) { si.si_signo = cursig(p, &ctx); if (si.si_signo != 0) { sigset_t smask = sigmask(si.si_signo); if (smask & mask) { atomic_clearbits_int(&p->p_siglist, smask); error = 0; break; } } /* per-POSIX, delay this error until after the above */ if (timeinvalid) error = EINVAL; /* per-POSIX, return immediately if timeout is zero-valued */ if (nsecs == 0) error = EAGAIN; if (error != 0) break; error = tsleep_nsec(&sigwaitsleep, PPAUSE|PCATCH, "sigwait", nsecs); } if (error == 0) { *retval = si.si_signo; if (SCARG(uap, info) != NULL) error = copyout(&si, SCARG(uap, info), sizeof(si)); } else if (error == ERESTART && SCARG(uap, timeout) != NULL) { /* * Restarting is wrong if there's a timeout, as it'll be * for the same interval again */ error = EINTR; } return (error); } void initsiginfo(siginfo_t *si, int sig, u_long trapno, int code, union sigval val) { memset(si, 0, sizeof(*si)); si->si_signo = sig; si->si_code = code; if (code == SI_USER) { si->si_value = val; } else { switch (sig) { case SIGSEGV: case SIGILL: case SIGBUS: case SIGFPE: si->si_addr = val.sival_ptr; si->si_trapno = trapno; break; case SIGXFSZ: break; } } } int filt_sigattach(struct knote *kn) { struct process *pr = curproc->p_p; int s; if (kn->kn_id >= NSIG) return EINVAL; kn->kn_ptr.p_process = pr; kn->kn_flags |= EV_CLEAR; /* automatically set */ s = splhigh(); klist_insert_locked(&pr->ps_klist, kn); splx(s); return (0); } void filt_sigdetach(struct knote *kn) { struct process *pr = kn->kn_ptr.p_process; int s; s = splhigh(); klist_remove_locked(&pr->ps_klist, kn); splx(s); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } void userret(struct proc *p) { struct sigctx ctx; int signum; /* send SIGPROF or SIGVTALRM if their timers interrupted this thread */ if (p->p_flag & P_PROFPEND) { atomic_clearbits_int(&p->p_flag, P_PROFPEND); KERNEL_LOCK(); psignal(p, SIGPROF); KERNEL_UNLOCK(); } if (p->p_flag & P_ALRMPEND) { atomic_clearbits_int(&p->p_flag, P_ALRMPEND); KERNEL_LOCK(); psignal(p, SIGVTALRM); KERNEL_UNLOCK(); } if (SIGPENDING(p) != 0) { while ((signum = cursig(p, &ctx)) != 0) postsig(p, signum, &ctx); } /* * If P_SIGSUSPEND is still set here, then we still need to restore * the original sigmask before returning to userspace. 
Also, this * might unmask some pending signals, so we need to check a second * time for signals to post. */ if (p->p_flag & P_SIGSUSPEND) { atomic_clearbits_int(&p->p_flag, P_SIGSUSPEND); p->p_sigmask = p->p_oldmask; while ((signum = cursig(p, &ctx)) != 0) postsig(p, signum, &ctx); } if (p->p_flag & P_SUSPSINGLE) single_thread_check(p, 0); WITNESS_WARN(WARN_PANIC, NULL, "userret: returning"); p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri; } int single_thread_check_locked(struct proc *p, int deep, int s) { struct process *pr = p->p_p; SCHED_ASSERT_LOCKED(); if (pr->ps_single != NULL && pr->ps_single != p) { do { /* if we're in deep, we need to unwind to the edge */ if (deep) { if (pr->ps_flags & PS_SINGLEUNWIND) return (ERESTART); if (pr->ps_flags & PS_SINGLEEXIT) return (EINTR); } if (atomic_dec_int_nv(&pr->ps_singlecount) == 0) wakeup(&pr->ps_singlecount); if (pr->ps_flags & PS_SINGLEEXIT) { SCHED_UNLOCK(s); KERNEL_LOCK(); exit1(p, 0, 0, EXIT_THREAD_NOCHECK); /* NOTREACHED */ } /* not exiting and don't need to unwind, so suspend */ p->p_stat = SSTOP; mi_switch(); } while (pr->ps_single != NULL); } return (0); } int single_thread_check(struct proc *p, int deep) { int s, error; SCHED_LOCK(s); error = single_thread_check_locked(p, deep, s); SCHED_UNLOCK(s); return error; } /* * Stop other threads in the process. The mode controls how and * where the other threads should stop: * - SINGLE_SUSPEND: stop wherever they are, will later either be told to exit * (by setting to SINGLE_EXIT) or be released (via single_thread_clear()) * - SINGLE_UNWIND: just unwind to kernel boundary, will be told to exit * or released as with SINGLE_SUSPEND * - SINGLE_EXIT: unwind to kernel boundary and exit */ int single_thread_set(struct proc *p, enum single_thread_mode mode, int wait) { struct process *pr = p->p_p; struct proc *q; int error, s; KASSERT(curproc == p); SCHED_LOCK(s); error = single_thread_check_locked(p, (mode == SINGLE_UNWIND), s); if (error) { SCHED_UNLOCK(s); return error; } switch (mode) { case SINGLE_SUSPEND: break; case SINGLE_UNWIND: atomic_setbits_int(&pr->ps_flags, PS_SINGLEUNWIND); break; case SINGLE_EXIT: atomic_setbits_int(&pr->ps_flags, PS_SINGLEEXIT); atomic_clearbits_int(&pr->ps_flags, PS_SINGLEUNWIND); break; #ifdef DIAGNOSTIC default: panic("single_thread_mode = %d", mode); #endif } pr->ps_singlecount = 0; membar_producer(); pr->ps_single = p; TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) { if (q == p) continue; if (q->p_flag & P_WEXIT) { if (mode == SINGLE_EXIT) { if (q->p_stat == SSTOP) { setrunnable(q); atomic_inc_int(&pr->ps_singlecount); } } continue; } atomic_setbits_int(&q->p_flag, P_SUSPSINGLE); switch (q->p_stat) { case SIDL: case SRUN: atomic_inc_int(&pr->ps_singlecount); break; case SSLEEP: /* if it's not interruptible, then just have to wait */ if (q->p_flag & P_SINTR) { /* merely need to suspend? just stop it */ if (mode == SINGLE_SUSPEND) { q->p_stat = SSTOP; break; } /* need to unwind or exit, so wake it */ setrunnable(q); } atomic_inc_int(&pr->ps_singlecount); break; case SSTOP: if (mode == SINGLE_EXIT) { setrunnable(q); atomic_inc_int(&pr->ps_singlecount); } break; case SDEAD: break; case SONPROC: atomic_inc_int(&pr->ps_singlecount); signotify(q); break; } } SCHED_UNLOCK(s); if (wait) single_thread_wait(pr, 1); return 0; } /* * Wait for other threads to stop. If recheck is false then the function * returns non-zero if the caller needs to restart the check else 0 is * returned. If recheck is true the return value is always 0. 
*/ int single_thread_wait(struct process *pr, int recheck) { struct sleep_state sls; int wait; /* wait until they're all suspended */ wait = pr->ps_singlecount > 0; while (wait) { sleep_setup(&sls, &pr->ps_singlecount, PWAIT, "suspend", 0); wait = pr->ps_singlecount > 0; sleep_finish(&sls, wait); if (!recheck) break; } return wait; } void single_thread_clear(struct proc *p, int flag) { struct process *pr = p->p_p; struct proc *q; int s; KASSERT(pr->ps_single == p); KASSERT(curproc == p); SCHED_LOCK(s); pr->ps_single = NULL; atomic_clearbits_int(&pr->ps_flags, PS_SINGLEUNWIND | PS_SINGLEEXIT); TAILQ_FOREACH(q, &pr->ps_threads, p_thr_link) { if (q == p || (q->p_flag & P_SUSPSINGLE) == 0) continue; atomic_clearbits_int(&q->p_flag, P_SUSPSINGLE); /* * if the thread was only stopped for single threading * then clearing that either makes it runnable or puts * it back into some sleep queue */ if (q->p_stat == SSTOP && (q->p_flag & flag) == 0) { if (q->p_wchan == NULL) setrunnable(q); else q->p_stat = SSLEEP; } } SCHED_UNLOCK(s); } void sigio_del(struct sigiolst *rmlist) { struct sigio *sigio; while ((sigio = LIST_FIRST(rmlist)) != NULL) { LIST_REMOVE(sigio, sio_pgsigio); crfree(sigio->sio_ucred); free(sigio, M_SIGIO, sizeof(*sigio)); } } void sigio_unlink(struct sigio_ref *sir, struct sigiolst *rmlist) { struct sigio *sigio; MUTEX_ASSERT_LOCKED(&sigio_lock); sigio = sir->sir_sigio; if (sigio != NULL) { KASSERT(sigio->sio_myref == sir); sir->sir_sigio = NULL; if (sigio->sio_pgid > 0) sigio->sio_proc = NULL; else sigio->sio_pgrp = NULL; LIST_REMOVE(sigio, sio_pgsigio); LIST_INSERT_HEAD(rmlist, sigio, sio_pgsigio); } } void sigio_free(struct sigio_ref *sir) { struct sigiolst rmlist; if (sir->sir_sigio == NULL) return; LIST_INIT(&rmlist); mtx_enter(&sigio_lock); sigio_unlink(sir, &rmlist); mtx_leave(&sigio_lock); sigio_del(&rmlist); } void sigio_freelist(struct sigiolst *sigiolst) { struct sigiolst rmlist; struct sigio *sigio; if (LIST_EMPTY(sigiolst)) return; LIST_INIT(&rmlist); mtx_enter(&sigio_lock); while ((sigio = LIST_FIRST(sigiolst)) != NULL) sigio_unlink(sigio->sio_myref, &rmlist); mtx_leave(&sigio_lock); sigio_del(&rmlist); } int sigio_setown(struct sigio_ref *sir, u_long cmd, caddr_t data) { struct sigiolst rmlist; struct proc *p = curproc; struct pgrp *pgrp = NULL; struct process *pr = NULL; struct sigio *sigio; int error; pid_t pgid = *(int *)data; if (pgid == 0) { sigio_free(sir); return (0); } if (cmd == TIOCSPGRP) { if (pgid < 0) return (EINVAL); pgid = -pgid; } sigio = malloc(sizeof(*sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(p->p_ucred); sigio->sio_myref = sir; LIST_INIT(&rmlist); /* * The kernel lock, and not sleeping between prfind()/pgfind() and * linking of the sigio ensure that the process or process group does * not disappear unexpectedly. */ KERNEL_LOCK(); mtx_enter(&sigio_lock); if (pgid > 0) { pr = prfind(pgid); if (pr == NULL) { error = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pr->ps_session != p->p_p->ps_session) { error = EPERM; goto fail; } if ((pr->ps_flags & PS_EXITING) != 0) { error = ESRCH; goto fail; } } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { error = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. 
* * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != p->p_p->ps_session) { error = EPERM; goto fail; } } if (pgid > 0) { sigio->sio_proc = pr; LIST_INSERT_HEAD(&pr->ps_sigiolst, sigio, sio_pgsigio); } else { sigio->sio_pgrp = pgrp; LIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); } sigio_unlink(sir, &rmlist); sir->sir_sigio = sigio; mtx_leave(&sigio_lock); KERNEL_UNLOCK(); sigio_del(&rmlist); return (0); fail: mtx_leave(&sigio_lock); KERNEL_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO, sizeof(*sigio)); return (error); } void sigio_getown(struct sigio_ref *sir, u_long cmd, caddr_t data) { struct sigio *sigio; pid_t pgid = 0; mtx_enter(&sigio_lock); sigio = sir->sir_sigio; if (sigio != NULL) pgid = sigio->sio_pgid; mtx_leave(&sigio_lock); if (cmd == TIOCGPGRP) pgid = -pgid; *(int *)data = pgid; } void sigio_copy(struct sigio_ref *dst, struct sigio_ref *src) { struct sigiolst rmlist; struct sigio *newsigio, *sigio; sigio_free(dst); if (src->sir_sigio == NULL) return; newsigio = malloc(sizeof(*newsigio), M_SIGIO, M_WAITOK); LIST_INIT(&rmlist); mtx_enter(&sigio_lock); sigio = src->sir_sigio; if (sigio == NULL) { mtx_leave(&sigio_lock); free(newsigio, M_SIGIO, sizeof(*newsigio)); return; } newsigio->sio_pgid = sigio->sio_pgid; newsigio->sio_ucred = crhold(sigio->sio_ucred); newsigio->sio_myref = dst; if (newsigio->sio_pgid > 0) { newsigio->sio_proc = sigio->sio_proc; LIST_INSERT_HEAD(&newsigio->sio_proc->ps_sigiolst, newsigio, sio_pgsigio); } else { newsigio->sio_pgrp = sigio->sio_pgrp; LIST_INSERT_HEAD(&newsigio->sio_pgrp->pg_sigiolst, newsigio, sio_pgsigio); } sigio_unlink(dst, &rmlist); dst->sir_sigio = newsigio; mtx_leave(&sigio_lock); sigio_del(&rmlist); }
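/*
 * Illustrative sketch, not part of the original source: the
 * single-threading helpers above are used as a set/work/clear
 * bracket.  cursig() takes SINGLE_SUSPEND around a ptrace stop and
 * releases it with single_thread_clear(); sigexit() takes it before
 * dumping core.  A caller that parks its sibling threads, does some
 * work and then lets them continue would look roughly like this
 * (the "..." body is hypothetical):
 *
 *	if (P_HASSIBLING(p))
 *		single_thread_set(p, SINGLE_SUSPEND, 1);
 *	... operate while the other threads are stopped ...
 *	single_thread_clear(p, 0);
 *
 * Passing wait=1 makes single_thread_set() call single_thread_wait()
 * itself, so the work only starts once every sibling has suspended.
 */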
/* $OpenBSD: in_var.h,v 1.41 2018/10/18 15:23:04 cheloha Exp $ */ /* $NetBSD: in_var.h,v 1.16 1996/02/13 23:42:15 christos Exp $ */ /* * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ #include <sys/queue.h> #ifdef _KERNEL /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_net{,mask} in host order */ u_int32_t ia_net; /* network number of interface */ u_int32_t ia_netmask; /* mask of net part */ TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ struct in_multi *ia_allhosts; /* multicast address record for the allhosts multicast group */ }; #endif struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g.
"en0" */ union { struct sockaddr_in ifrau_addr; int ifrau_align; } ifra_ifrau; #ifndef ifra_addr #define ifra_addr ifra_ifrau.ifrau_addr #endif struct sockaddr_in ifra_dstaddr; #define ifra_broadaddr ifra_dstaddr struct sockaddr_in ifra_mask; }; #ifdef _KERNEL /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ do { \ struct ifaddr *ifa; \ NET_ASSERT_LOCKED(); \ TAILQ_FOREACH(ifa, &(ifp)->if_addrlist, ifa_list) { \ if (ifa->ifa_addr->sa_family == AF_INET) \ break; \ } \ (ia) = ifatoia(ifa); \ } while (/* CONSTCOND */ 0) #endif /* * Per-interface router version information. */ struct router_info { unsigned int rti_ifidx; int rti_type; /* type of router on this interface */ int rti_age; /* time since last v1 query */ LIST_ENTRY(router_info) rti_list; }; #ifdef _KERNEL /* * Internet multicast address structure. There is one of these for each IP * multicast group to which this host belongs on a given network interface. */ struct in_multi { struct ifmaddr inm_ifma; /* Protocol-independent info */ #define inm_refcnt inm_ifma.ifma_refcnt #define inm_ifidx inm_ifma.ifma_ifidx struct sockaddr_in inm_sin; /* IPv4 multicast address */ #define inm_addr inm_sin.sin_addr u_int inm_state; /* state of membership */ u_int inm_timer; /* IGMP membership report timer */ struct router_info *inm_rti; /* router version info */ }; static __inline struct in_multi * ifmatoinm(struct ifmaddr *ifma) { return ((struct in_multi *)(ifma)); } /* * Macro for looking up the in_multi record for a given IP multicast * address on a given interface. If no matching record is found, "inm" * returns NULL. */ #define IN_LOOKUP_MULTI(addr, ifp, inm) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ /* struct in_multi *inm; */ \ do { \ struct ifmaddr *ifma; \ \ (inm) = NULL; \ NET_ASSERT_LOCKED(); \ TAILQ_FOREACH(ifma, &(ifp)->if_maddrlist, ifma_list) \ if (ifma->ifma_addr->sa_family == AF_INET && \ ifmatoinm(ifma)->inm_addr.s_addr == (addr).s_addr) {\ (inm) = ifmatoinm(ifma); \ break; \ } \ } while (/* CONSTCOND */ 0) int in_ifinit(struct ifnet *, struct in_ifaddr *, struct sockaddr_in *, int); struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); int in_hasmulti(struct in_addr *, struct ifnet *); void in_ifscrub(struct ifnet *, struct in_ifaddr *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *); int in_ioctl(u_long, caddr_t, struct ifnet *, int); void in_prefixlen2mask(struct in_addr *, int); #endif #endif /* _NETINET_IN_VAR_H_ */
/* $OpenBSD: if_etherip.c,v 1.50 2022/02/28 00:12:11 dlg Exp $ */ /* * Copyright (c) 2015 Kazuya GODA <goda@openbsd.org> * * Permission to use, copy, modify, and distribute this
software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "bpfilter.h" #include "pf.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/ioctl.h> #include <sys/device.h> #include <sys/sysctl.h> #include <sys/tree.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> #include <net/if_media.h> #include <net/rtable.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/if_ether.h> #include <netinet/ip_ether.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #endif #if NBPFILTER > 0 #include <net/bpf.h> #endif #if NPF > 0 #include <net/pfvar.h> #endif #include <net/if_etherip.h> union etherip_addr { struct in_addr in4; struct in6_addr in6; }; struct etherip_tunnel { union etherip_addr _t_src; #define t_src4 _t_src.in4 #define t_src6 _t_src.in6 union etherip_addr _t_dst; #define t_dst4 _t_dst.in4 #define t_dst6 _t_dst.in6 unsigned int t_rtableid; sa_family_t t_af; uint8_t t_tos; TAILQ_ENTRY(etherip_tunnel) t_entry; }; TAILQ_HEAD(etherip_list, etherip_tunnel); static inline int etherip_cmp(const struct etherip_tunnel *, const struct etherip_tunnel *); struct etherip_softc { struct etherip_tunnel sc_tunnel; /* must be first */ struct arpcom sc_ac; struct ifmedia sc_media; int sc_txhprio; int sc_rxhprio; uint16_t sc_df; uint8_t sc_ttl; }; /* * We can control the acceptance of EtherIP packets by altering the sysctl * net.inet.etherip.allow value. Zero means drop them, all else is acceptance. 
*/ int etherip_allow = 0; struct cpumem *etheripcounters; void etheripattach(int); int etherip_clone_create(struct if_clone *, int); int etherip_clone_destroy(struct ifnet *); int etherip_ioctl(struct ifnet *, u_long, caddr_t); int etherip_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void etherip_start(struct ifnet *); int etherip_media_change(struct ifnet *); void etherip_media_status(struct ifnet *, struct ifmediareq *); int etherip_set_tunnel(struct etherip_softc *, struct if_laddrreq *); int etherip_get_tunnel(struct etherip_softc *, struct if_laddrreq *); int etherip_del_tunnel(struct etherip_softc *); int etherip_up(struct etherip_softc *); int etherip_down(struct etherip_softc *); struct etherip_softc *etherip_find(const struct etherip_tunnel *); int etherip_input(struct etherip_tunnel *, struct mbuf *, uint8_t, int); struct if_clone etherip_cloner = IF_CLONE_INITIALIZER("etherip", etherip_clone_create, etherip_clone_destroy); struct etherip_list etherip_list = TAILQ_HEAD_INITIALIZER(etherip_list); void etheripattach(int count) { if_clone_attach(&etherip_cloner); etheripcounters = counters_alloc(etherips_ncounters); } int etherip_clone_create(struct if_clone *ifc, int unit) { struct ifnet *ifp; struct etherip_softc *sc; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = &sc->sc_ac.ac_if; snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d", ifc->ifc_name, unit); sc->sc_ttl = ip_defttl; sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */ sc->sc_rxhprio = IF_HDRPRIO_PACKET; sc->sc_df = htons(0); ifp->if_softc = sc; ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN; ifp->if_ioctl = etherip_ioctl; ifp->if_output = etherip_output; ifp->if_start = etherip_start; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_xflags = IFXF_CLONED; ifp->if_capabilities = IFCAP_VLAN_MTU; ether_fakeaddr(ifp); ifmedia_init(&sc->sc_media, 0, etherip_media_change, etherip_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_counters_alloc(ifp); if_attach(ifp); ether_ifattach(ifp); NET_LOCK(); TAILQ_INSERT_TAIL(&etherip_list, &sc->sc_tunnel, t_entry); NET_UNLOCK(); return (0); } int etherip_clone_destroy(struct ifnet *ifp) { struct etherip_softc *sc = ifp->if_softc; NET_LOCK(); if (ISSET(ifp->if_flags, IFF_RUNNING)) etherip_down(sc); TAILQ_REMOVE(&etherip_list, &sc->sc_tunnel, t_entry); NET_UNLOCK(); ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY); ether_ifdetach(ifp); if_detach(ifp); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } int etherip_media_change(struct ifnet *ifp) { return 0; } void etherip_media_status(struct ifnet *ifp, struct ifmediareq *imr) { imr->ifm_active = IFM_ETHER | IFM_AUTO; imr->ifm_status = IFM_AVALID | IFM_ACTIVE; } int etherip_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { struct m_tag *mtag; mtag = NULL; while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) { if (*(int *)(mtag + 1) == ifp->if_index) { m_freem(m); return (EIO); } } return (ether_output(ifp, m, dst, rt)); } void etherip_start(struct ifnet *ifp) { struct etherip_softc *sc = ifp->if_softc; struct mbuf *m; int error; #if NBPFILTER > 0 caddr_t if_bpf; #endif while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) { #if NBPFILTER > 0 if_bpf = ifp->if_bpf; if (if_bpf) bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT); #endif switch (sc->sc_tunnel.t_af) { case AF_INET: error = ip_etherip_output(ifp, m); break; #ifdef INET6 case AF_INET6: error = 
ip6_etherip_output(ifp, m); break; #endif default: /* unhandled_af(sc->sc_tunnel.t_af); */ m_freem(m); continue; } if (error) ifp->if_oerrors++; } } int etherip_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct etherip_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* FALLTHROUGH */ case SIOCSIFFLAGS: if (ISSET(ifp->if_flags, IFF_UP)) { if (!ISSET(ifp->if_flags, IFF_RUNNING)) error = etherip_up(sc); else error = 0; } else { if (ISSET(ifp->if_flags, IFF_RUNNING)) error = etherip_down(sc); } break; case SIOCSLIFPHYRTABLE: if (ifr->ifr_rdomainid < 0 || ifr->ifr_rdomainid > RT_TABLEID_MAX || !rtable_exists(ifr->ifr_rdomainid)) { error = EINVAL; break; } sc->sc_tunnel.t_rtableid = ifr->ifr_rdomainid; break; case SIOCGLIFPHYRTABLE: ifr->ifr_rdomainid = sc->sc_tunnel.t_rtableid; break; case SIOCSLIFPHYADDR: error = etherip_set_tunnel(sc, (struct if_laddrreq *)data); break; case SIOCGLIFPHYADDR: error = etherip_get_tunnel(sc, (struct if_laddrreq *)data); break; case SIOCDIFPHYADDR: error = etherip_del_tunnel(sc); break; case SIOCSTXHPRIO: error = if_txhprio_l2_check(ifr->ifr_hdrprio); if (error != 0) break; sc->sc_txhprio = ifr->ifr_hdrprio; break; case SIOCGTXHPRIO: ifr->ifr_hdrprio = sc->sc_txhprio; break; case SIOCSRXHPRIO: error = if_rxhprio_l2_check(ifr->ifr_hdrprio); if (error != 0) break; sc->sc_rxhprio = ifr->ifr_hdrprio; break; case SIOCGRXHPRIO: ifr->ifr_hdrprio = sc->sc_rxhprio; break; case SIOCSLIFPHYTTL: if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) { error = EINVAL; break; } /* commit */ sc->sc_ttl = (uint8_t)ifr->ifr_ttl; break; case SIOCGLIFPHYTTL: ifr->ifr_ttl = (int)sc->sc_ttl; break; case SIOCSLIFPHYDF: /* commit */ sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0); break; case SIOCGLIFPHYDF: ifr->ifr_df = sc->sc_df ? 
1 : 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCADDMULTI: case SIOCDELMULTI: break; default: error = ether_ioctl(ifp, &sc->sc_ac, cmd, data); break; } if (error == ENETRESET) { /* no hardware to program */ error = 0; } return (error); } int etherip_set_tunnel(struct etherip_softc *sc, struct if_laddrreq *req) { struct sockaddr *src = (struct sockaddr *)&req->addr; struct sockaddr *dst = (struct sockaddr *)&req->dstaddr; struct sockaddr_in *src4, *dst4; #ifdef INET6 struct sockaddr_in6 *src6, *dst6; int error; #endif /* sa_family and sa_len must be equal */ if (src->sa_family != dst->sa_family || src->sa_len != dst->sa_len) return (EINVAL); /* validate */ switch (dst->sa_family) { case AF_INET: if (dst->sa_len != sizeof(*dst4)) return (EINVAL); src4 = (struct sockaddr_in *)src; if (in_nullhost(src4->sin_addr) || IN_MULTICAST(src4->sin_addr.s_addr)) return (EINVAL); dst4 = (struct sockaddr_in *)dst; if (in_nullhost(dst4->sin_addr) || IN_MULTICAST(dst4->sin_addr.s_addr)) return (EINVAL); sc->sc_tunnel.t_src4 = src4->sin_addr; sc->sc_tunnel.t_dst4 = dst4->sin_addr; break; #ifdef INET6 case AF_INET6: if (dst->sa_len != sizeof(*dst6)) return (EINVAL); src6 = (struct sockaddr_in6 *)src; if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&src6->sin6_addr)) return (EINVAL); dst6 = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr)) return (EINVAL); error = in6_embedscope(&sc->sc_tunnel.t_src6, src6, NULL); if (error != 0) return (error); error = in6_embedscope(&sc->sc_tunnel.t_dst6, dst6, NULL); if (error != 0) return (error); break; #endif default: return (EAFNOSUPPORT); } /* commit */ sc->sc_tunnel.t_af = dst->sa_family; return (0); } int etherip_get_tunnel(struct etherip_softc *sc, struct if_laddrreq *req) { struct sockaddr *src = (struct sockaddr *)&req->addr; struct sockaddr *dst = (struct sockaddr *)&req->dstaddr; struct sockaddr_in *sin; #ifdef INET6 /* ifconfig already embeds the scopeid */ struct sockaddr_in6 *sin6; #endif switch (sc->sc_tunnel.t_af) { case AF_UNSPEC: return (EADDRNOTAVAIL); case AF_INET: sin = (struct sockaddr_in *)src; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = sc->sc_tunnel.t_src4; sin = (struct sockaddr_in *)dst; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = sc->sc_tunnel.t_dst4; break; #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)src; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); in6_recoverscope(sin6, &sc->sc_tunnel.t_src6); sin6 = (struct sockaddr_in6 *)dst; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); in6_recoverscope(sin6, &sc->sc_tunnel.t_dst6); break; #endif default: return (EAFNOSUPPORT); } return (0); } int etherip_del_tunnel(struct etherip_softc *sc) { /* commit */ sc->sc_tunnel.t_af = AF_UNSPEC; return (0); } int etherip_up(struct etherip_softc *sc) { struct ifnet *ifp = &sc->sc_ac.ac_if; NET_ASSERT_LOCKED(); SET(ifp->if_flags, IFF_RUNNING); return (0); } int etherip_down(struct etherip_softc *sc) { struct ifnet *ifp = &sc->sc_ac.ac_if; NET_ASSERT_LOCKED(); CLR(ifp->if_flags, IFF_RUNNING); return (0); } int ip_etherip_output(struct ifnet *ifp, struct mbuf *m) { struct etherip_softc *sc = (struct etherip_softc *)ifp->if_softc; struct m_tag *mtag; struct etherip_header *eip; struct 
ip *ip; M_PREPEND(m, sizeof(*ip) + sizeof(*eip), M_DONTWAIT); if (m == NULL) { etheripstat_inc(etherips_adrops); return ENOBUFS; } ip = mtod(m, struct ip *); memset(ip, 0, sizeof(struct ip)); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? m->m_pkthdr.pf.prio : sc->sc_txhprio); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_id = htons(ip_randomid()); ip->ip_off = sc->sc_df; ip->ip_ttl = sc->sc_ttl; ip->ip_p = IPPROTO_ETHERIP; ip->ip_src = sc->sc_tunnel.t_src4; ip->ip_dst = sc->sc_tunnel.t_dst4; eip = (struct etherip_header *)(ip + 1); eip->eip_ver = ETHERIP_VERSION; eip->eip_res = 0; eip->eip_pad = 0; mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT); if (mtag == NULL) { m_freem(m); return (ENOMEM); } *(int *)(mtag + 1) = ifp->if_index; m_tag_prepend(m, mtag); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.ph_rtableid = sc->sc_tunnel.t_rtableid; #if NPF > 0 pf_pkt_addr_changed(m); #endif etheripstat_pkt(etherips_opackets, etherips_obytes, m->m_pkthdr.len - (sizeof(struct ip) + sizeof(struct etherip_header))); ip_send(m); return (0); } int ip_etherip_input(struct mbuf **mp, int *offp, int type, int af) { struct mbuf *m = *mp; struct etherip_tunnel key; struct ip *ip; ip = mtod(m, struct ip *); key.t_af = AF_INET; key.t_src4 = ip->ip_dst; key.t_dst4 = ip->ip_src; return (etherip_input(&key, m, ip->ip_tos, *offp)); } struct etherip_softc * etherip_find(const struct etherip_tunnel *key) { struct etherip_tunnel *t; struct etherip_softc *sc; TAILQ_FOREACH(t, &etherip_list, t_entry) { if (etherip_cmp(key, t) != 0) continue; sc = (struct etherip_softc *)t; if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING)) continue; return (sc); } return (NULL); } int etherip_input(struct etherip_tunnel *key, struct mbuf *m, uint8_t tos, int hlen) { struct etherip_softc *sc; struct ifnet *ifp; struct etherip_header *eip; int rxprio; if (!etherip_allow && (m->m_flags & (M_AUTH|M_CONF)) == 0) { etheripstat_inc(etherips_pdrops); goto drop; } key->t_rtableid = m->m_pkthdr.ph_rtableid; NET_ASSERT_LOCKED(); sc = etherip_find(key); if (sc == NULL) { etheripstat_inc(etherips_noifdrops); goto drop; } m_adj(m, hlen); m = m_pullup(m, sizeof(*eip)); if (m == NULL) { etheripstat_inc(etherips_adrops); return IPPROTO_DONE; } eip = mtod(m, struct etherip_header *); if (eip->eip_ver != ETHERIP_VERSION || eip->eip_pad) { etheripstat_inc(etherips_adrops); goto drop; } m_adj(m, sizeof(struct etherip_header)); etheripstat_pkt(etherips_ipackets, etherips_ibytes, m->m_pkthdr.len); m = m_pullup(m, sizeof(struct ether_header)); if (m == NULL) { etheripstat_inc(etherips_adrops); return IPPROTO_DONE; } rxprio = sc->sc_rxhprio; switch (rxprio) { case IF_HDRPRIO_PACKET: break; case IF_HDRPRIO_OUTER: m->m_pkthdr.pf.prio = IFQ_TOS2PRIO(tos); break; default: m->m_pkthdr.pf.prio = rxprio; break; } ifp = &sc->sc_ac.ac_if; m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.ph_ifidx = ifp->if_index; m->m_pkthdr.ph_rtableid = ifp->if_rdomain; #if NPF > 0 pf_pkt_addr_changed(m); #endif if_vinput(ifp, m); return IPPROTO_DONE; drop: m_freem(m); return (IPPROTO_DONE); } #ifdef INET6 int ip6_etherip_output(struct ifnet *ifp, struct mbuf *m) { struct etherip_softc *sc = ifp->if_softc; struct m_tag *mtag; struct ip6_hdr *ip6; struct etherip_header *eip; uint16_t len; uint32_t flow; if (IN6_IS_ADDR_UNSPECIFIED(&sc->sc_tunnel.t_dst6)) { m_freem(m); return (ENETUNREACH); } len = m->m_pkthdr.len; M_PREPEND(m, sizeof(*ip6) + sizeof(*eip), M_DONTWAIT); if (m == NULL) { 
etheripstat_inc(etherips_adrops); return ENOBUFS; } flow = IPV6_VERSION << 24; flow |= IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20; ip6 = mtod(m, struct ip6_hdr *); htobem32(&ip6->ip6_flow, flow); ip6->ip6_nxt = IPPROTO_ETHERIP; ip6->ip6_hlim = sc->sc_ttl; ip6->ip6_plen = htons(len); memcpy(&ip6->ip6_src, &sc->sc_tunnel.t_src6, sizeof(ip6->ip6_src)); memcpy(&ip6->ip6_dst, &sc->sc_tunnel.t_dst6, sizeof(ip6->ip6_dst)); eip = (struct etherip_header *)(ip6 + 1); eip->eip_ver = ETHERIP_VERSION; eip->eip_res = 0; eip->eip_pad = 0; mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT); if (mtag == NULL) { m_freem(m); return (ENOMEM); } *(int *)(mtag + 1) = ifp->if_index; m_tag_prepend(m, mtag); if (sc->sc_df) SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.ph_rtableid = sc->sc_tunnel.t_rtableid; #if NPF > 0 pf_pkt_addr_changed(m); #endif etheripstat_pkt(etherips_opackets, etherips_obytes, len); ip6_send(m); return (0); } int ip6_etherip_input(struct mbuf **mp, int *offp, int proto, int af) { struct mbuf *m = *mp; struct etherip_tunnel key; const struct ip6_hdr *ip6; uint32_t flow; ip6 = mtod(m, const struct ip6_hdr *); key.t_af = AF_INET6; key.t_src6 = ip6->ip6_dst; key.t_dst6 = ip6->ip6_src; flow = bemtoh32(&ip6->ip6_flow); return (etherip_input(&key, m, flow >> 20, *offp)); } #endif /* INET6 */ int etherip_sysctl_etheripstat(void *oldp, size_t *oldlenp, void *newp) { struct etheripstat etheripstat; CTASSERT(sizeof(etheripstat) == (etherips_ncounters * sizeof(uint64_t))); memset(&etheripstat, 0, sizeof etheripstat); counters_read(etheripcounters, (uint64_t *)&etheripstat, etherips_ncounters); return (sysctl_rdstruct(oldp, oldlenp, newp, &etheripstat, sizeof(etheripstat))); } int etherip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; /* All sysctl names at this level are terminal. */ if (namelen != 1) return ENOTDIR; switch (name[0]) { case ETHERIPCTL_ALLOW: NET_LOCK(); error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &etherip_allow, 0, 1); NET_UNLOCK(); return (error); case ETHERIPCTL_STATS: return (etherip_sysctl_etheripstat(oldp, oldlenp, newp)); default: break; } return ENOPROTOOPT; } static inline int etherip_ip_cmp(int af, const union etherip_addr *a, const union etherip_addr *b) { switch (af) { #ifdef INET6 case AF_INET6: return (memcmp(&a->in6, &b->in6, sizeof(a->in6))); /* FALLTHROUGH */ #endif /* INET6 */ case AF_INET: return (memcmp(&a->in4, &b->in4, sizeof(a->in4))); break; default: panic("%s: unsupported af %d", __func__, af); } return (0); } static inline int etherip_cmp(const struct etherip_tunnel *a, const struct etherip_tunnel *b) { int rv; if (a->t_rtableid > b->t_rtableid) return (1); if (a->t_rtableid < b->t_rtableid) return (-1); /* sort by address */ if (a->t_af > b->t_af) return (1); if (a->t_af < b->t_af) return (-1); rv = etherip_ip_cmp(a->t_af, &a->_t_dst, &b->_t_dst); if (rv != 0) return (rv); rv = etherip_ip_cmp(a->t_af, &a->_t_src, &b->_t_src); if (rv != 0) return (rv); return (0); }
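/*
 * Encapsulation sketch, added as an illustrative comment (not in the
 * original file): ip_etherip_output() above prepends an outer IPv4
 * header and an EtherIP header in front of the Ethernet frame pulled
 * from if_snd, so a tunnelled packet on the wire looks like:
 *
 *	+------------------+------------------------+----------------------+
 *	| struct ip        | struct etherip_header  | inner Ethernet frame |
 *	| ip_p = IPPROTO_  | eip_ver =              | (payload of the      |
 *	|      ETHERIP     |   ETHERIP_VERSION      |  etherip interface)  |
 *	+------------------+------------------------+----------------------+
 *
 * ip6_etherip_output() emits the same layout behind a struct ip6_hdr,
 * and on input etherip_input() strips the outer headers and passes
 * the inner frame to if_vinput() on the matching interface.
 */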
/* $OpenBSD: vfs_getcwd.c,v 1.37 2022/08/14 01:58:28 jsg Exp $ */ /* $NetBSD: vfs_getcwd.c,v 1.3.2.3 1999/07/11 10:24:09 sommerfeld Exp $ */ /* * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Bill Sommerfeld. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/stat.h> #include <sys/lock.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/ktrace.h> #include <sys/proc.h> #include <sys/uio.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <ufs/ufs/dir.h> /* only for DIRBLKSIZ */ #include <sys/syscallargs.h> /* Find parent vnode of *lvpp, return in *uvpp */ int vfs_getcwd_scandir(struct vnode **lvpp, struct vnode **uvpp, char **bpp, char *bufp, struct proc *p) { int eofflag, tries, dirbuflen = 0, len, reclen, error = 0; off_t off; struct uio uio; struct iovec iov; char *dirbuf = NULL; ino_t fileno; struct vattr va; struct vnode *uvp = NULL; struct vnode *lvp = *lvpp; struct componentname cn; tries = 0; /* * If we want the filename, get some info we need while the * current directory is still locked. */ if (bufp != NULL) { error = VOP_GETATTR(lvp, &va, p->p_ucred, p); if (error) { vput(lvp); *lvpp = NULL; *uvpp = NULL; return (error); } } cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY; cn.cn_proc = p; cn.cn_cred = p->p_ucred; cn.cn_pnbuf = NULL; cn.cn_nameptr = ".."; cn.cn_namelen = 2; cn.cn_consume = 0; /* Get parent vnode using lookup of '..' */ error = VOP_LOOKUP(lvp, uvpp, &cn); if (error) { vput(lvp); *lvpp = NULL; *uvpp = NULL; return (error); } uvp = *uvpp; /* If we don't care about the pathname, we're done */ if (bufp == NULL) { error = 0; goto out; } fileno = va.va_fileid; dirbuflen = DIRBLKSIZ; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; /* XXX we need some limit for fuse, 1 MB should be enough */ if (dirbuflen > 0xfffff) { error = EINVAL; goto out; } dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); off = 0; do { char *cpos; struct dirent *dp; iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = p; eofflag = 0; /* Call VOP_READDIR of parent */ error = VOP_READDIR(uvp, &uio, p->p_ucred, &eofflag); off = uio.uio_offset; /* Try again if NFS tosses its cookies */ if (error == EINVAL && tries < 3) { tries++; off = 0; continue; } else if (error) { goto out; /* Old userland getcwd() behaviour */ } cpos = dirbuf; tries = 0; /* Scan directory page looking for matching vnode */ for (len = (dirbuflen - uio.uio_resid); len > 0; len -= reclen) { dp = (struct dirent *)cpos; reclen = dp->d_reclen; /* Check for malformed directory */ if (reclen < DIRENT_RECSIZE(1) || reclen > len) { error = EINVAL; goto out; } if (dp->d_fileno == fileno) { char *bp = *bpp; if (offsetof(struct dirent, d_name) + dp->d_namlen > reclen) { error = EINVAL; goto out; } bp -= dp->d_namlen; if (bp <= bufp) { error = ERANGE; goto out; } memmove(bp, dp->d_name, dp->d_namlen); error = 0; *bpp = bp; goto out; } cpos += reclen; } } while (!eofflag); error = ENOENT; out: vrele(lvp); *lvpp = NULL; free(dirbuf, M_TEMP, dirbuflen); return (error); } /* Do a lookup in the vnode-to-name reverse */ int vfs_getcwd_getcache(struct vnode **lvpp, struct vnode **uvpp, char **bpp, char *bufp) { struct vnode *lvp, *uvp = NULL; char *obp; int error, vpid; lvp = *lvpp; obp = *bpp; /* Save original position to restore to on error */ error = cache_revlookup(lvp, uvpp, bpp, bufp); if (error) { if (error != -1) { vput(lvp); *lvpp = NULL; *uvpp = NULL; } return (error); } uvp = *uvpp; vpid = uvp->v_id; /* Release current lock before acquiring the parent lock */ VOP_UNLOCK(lvp); error = 
vget(uvp, LK_EXCLUSIVE | LK_RETRY); if (error) *uvpp = NULL; /* * Verify that vget() succeeded, and check that vnode capability * didn't change while we were waiting for the lock. */ if (error || (vpid != uvp->v_id)) { /* * Try to get our lock back. If that works, tell the caller to * try things the hard way, otherwise give up. */ if (!error) vput(uvp); *uvpp = NULL; error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (!error) { *bpp = obp; /* restore the buffer */ return (-1); } } vrele(lvp); *lvpp = NULL; return (error); } /* Common routine shared by sys___getcwd() and vn_isunder() and sys___realpath() */ int vfs_getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp, int limit, int flags, struct proc *p) { struct filedesc *fdp = p->p_fd; struct vnode *uvp = NULL; char *bp = NULL; int error, perms = VEXEC; if (rvp == NULL) { rvp = fdp->fd_rdir; if (rvp == NULL) rvp = rootvnode; } vref(rvp); vref(lvp); error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (error) { vrele(lvp); lvp = NULL; goto out; } if (bufp) bp = *bpp; if (lvp == rvp) { if (bp) *(--bp) = '/'; goto out; } /* * This loop will terminate when we hit the root, VOP_READDIR() or * VOP_LOOKUP() fails, or we run out of space in the user buffer. */ do { if (lvp->v_type != VDIR) { error = ENOTDIR; goto out; } /* Check for access if caller cares */ if (flags & GETCWD_CHECK_ACCESS) { error = VOP_ACCESS(lvp, perms, p->p_ucred, p); if (error) goto out; perms = VEXEC|VREAD; } /* Step up if we're a covered vnode */ while (lvp->v_flag & VROOT) { struct vnode *tvp; if (lvp == rvp) goto out; tvp = lvp; lvp = lvp->v_mount->mnt_vnodecovered; vput(tvp); if (lvp == NULL) { error = ENOENT; goto out; } vref(lvp); error = vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); if (error) { vrele(lvp); lvp = NULL; goto out; } } /* Look in the name cache */ error = vfs_getcwd_getcache(&lvp, &uvp, &bp, bufp); if (error == -1) { /* If that fails, look in the directory */ error = vfs_getcwd_scandir(&lvp, &uvp, &bp, bufp, p); } if (error) goto out; #ifdef DIAGNOSTIC if (lvp != NULL) panic("getcwd: oops, forgot to null lvp"); if (bufp && (bp <= bufp)) { panic("getcwd: oops, went back too far"); } #endif if (bp) *(--bp) = '/'; lvp = uvp; uvp = NULL; limit--; } while ((lvp != rvp) && (limit > 0)); out: if (bpp) *bpp = bp; if (uvp) vput(uvp); if (lvp) vput(lvp); vrele(rvp); return (error); } /* Find pathname of a process's current directory */ int sys___getcwd(struct proc *p, void *v, register_t *retval) { struct sys___getcwd_args *uap = v; int error, len = SCARG(uap, len); char *path, *bp; if (len > MAXPATHLEN * 4) len = MAXPATHLEN * 4; else if (len < 2) return (ERANGE); path = malloc(len, M_TEMP, M_WAITOK); bp = &path[len - 1]; *bp = '\0'; /* * 5th argument here is "max number of vnodes to traverse". * Since each entry takes up at least 2 bytes in the output * buffer, limit it to N/2 vnodes for an N byte buffer. */ error = vfs_getcwd_common(p->p_fd->fd_cdir, NULL, &bp, path, len/2, GETCWD_CHECK_ACCESS, p); if (error) goto out; /* Put the result into user buffer */ error = copyoutstr(bp, SCARG(uap, buf), MAXPATHLEN, NULL); #ifdef KTRACE if (KTRPOINT(p, KTR_NAMEI)) ktrnamei(p, bp); #endif out: free(path, M_TEMP, len); return (error); }
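/*
 * Worked example, added as an illustrative comment (not in the
 * original file): the pathname is assembled right to left.
 * sys___getcwd() points bp at the terminating NUL at the end of the
 * scratch buffer, and each pass through vfs_getcwd_common() prepends
 * one component name (from the name cache, or from scanning ".." via
 * VOP_READDIR()) and then a '/'.  For a hypothetical cwd of /usr/src
 * the buffer fills in like this:
 *
 *	[...............\0]	initial state, bp at the NUL
 *	[............src\0]	first component copied in
 *	[.........../src\0]	'/' prepended
 *	[........usr/src\0]	second component
 *	[......./usr/src\0]	final string, bp points here
 *
 * copyoutstr() then copies starting at bp, so the unused bytes at the
 * front of the buffer are never examined.
 */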
/* $OpenBSD: ip_ipsp.c,v 1.273 2022/08/06 15:57:59 bluhm Exp $ */ /* * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr), * Niels Provos (provos@physnet.uni-hamburg.de) and * Niklas Hallqvist (niklas@appli.se). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist. * * Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 1999 Niklas Hallqvist. * Copyright (c) 2001, Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all.
* * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "pf.h" #include "pfsync.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/kernel.h> #include <sys/timeout.h> #include <sys/pool.h> #include <sys/atomic.h> #include <sys/mutex.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/ip_ipip.h> #if NPF > 0 #include <net/pfvar.h> #endif #if NPFSYNC > 0 #include <net/if_pfsync.h> #endif #include <netinet/ip_ipsp.h> #include <net/pfkeyv2.h> #ifdef DDB #include <ddb/db_output.h> void tdb_hashstats(void); #endif #ifdef ENCDEBUG #define DPRINTF(fmt, args...) \ do { \ if (encdebug) \ printf("%s: " fmt "\n", __func__, ## args); \ } while (0) #else #define DPRINTF(fmt, args...) \ do { } while (0) #endif /* * Locks used to protect global data and struct members: * D tdb_sadb_mtx * F ipsec_flows_mtx SA database global mutex */ struct mutex ipsec_flows_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); int tdb_rehash(void); void tdb_timeout(void *); void tdb_firstuse(void *); void tdb_soft_timeout(void *); void tdb_soft_firstuse(void *); int tdb_hash(u_int32_t, union sockaddr_union *, u_int8_t); int ipsec_in_use = 0; u_int64_t ipsec_last_added = 0; int ipsec_ids_idle = 100; /* keep free ids for 100s */ struct pool tdb_pool; /* Protected by the NET_LOCK(). */ u_int32_t ipsec_ids_next_flow = 1; /* [F] may not be zero */ struct ipsec_ids_tree ipsec_ids_tree; /* [F] */ struct ipsec_ids_flows ipsec_ids_flows; /* [F] */ struct ipsec_policy_head ipsec_policy_head = TAILQ_HEAD_INITIALIZER(ipsec_policy_head); void ipsp_ids_gc(void *); LIST_HEAD(, ipsec_ids) ipsp_ids_gc_list = LIST_HEAD_INITIALIZER(ipsp_ids_gc_list); /* [F] */ struct timeout ipsp_ids_gc_timeout = TIMEOUT_INITIALIZER_FLAGS(ipsp_ids_gc, NULL, TIMEOUT_PROC); static inline int ipsp_ids_cmp(const struct ipsec_ids *, const struct ipsec_ids *); static inline int ipsp_ids_flow_cmp(const struct ipsec_ids *, const struct ipsec_ids *); RBT_PROTOTYPE(ipsec_ids_tree, ipsec_ids, id_node_flow, ipsp_ids_cmp); RBT_PROTOTYPE(ipsec_ids_flows, ipsec_ids, id_node_id, ipsp_ids_flow_cmp); RBT_GENERATE(ipsec_ids_tree, ipsec_ids, id_node_flow, ipsp_ids_cmp); RBT_GENERATE(ipsec_ids_flows, ipsec_ids, id_node_id, ipsp_ids_flow_cmp); /* * This is the proper place to define the various encapsulation transforms. 
*/ const struct xformsw xformsw[] = { #ifdef IPSEC { .xf_type = XF_IP4, .xf_flags = 0, .xf_name = "IPv4 Simple Encapsulation", .xf_attach = ipe4_attach, .xf_init = ipe4_init, .xf_zeroize = ipe4_zeroize, .xf_input = ipe4_input, .xf_output = NULL, }, { .xf_type = XF_AH, .xf_flags = XFT_AUTH, .xf_name = "IPsec AH", .xf_attach = ah_attach, .xf_init = ah_init, .xf_zeroize = ah_zeroize, .xf_input = ah_input, .xf_output = ah_output, }, { .xf_type = XF_ESP, .xf_flags = XFT_CONF|XFT_AUTH, .xf_name = "IPsec ESP", .xf_attach = esp_attach, .xf_init = esp_init, .xf_zeroize = esp_zeroize, .xf_input = esp_input, .xf_output = esp_output, }, { .xf_type = XF_IPCOMP, .xf_flags = XFT_COMP, .xf_name = "IPcomp", .xf_attach = ipcomp_attach, .xf_init = ipcomp_init, .xf_zeroize = ipcomp_zeroize, .xf_input = ipcomp_input, .xf_output = ipcomp_output, }, #endif /* IPSEC */ #ifdef TCP_SIGNATURE { .xf_type = XF_TCPSIGNATURE, .xf_flags = XFT_AUTH, .xf_name = "TCP MD5 Signature Option, RFC 2385", .xf_attach = tcp_signature_tdb_attach, .xf_init = tcp_signature_tdb_init, .xf_zeroize = tcp_signature_tdb_zeroize, .xf_input = tcp_signature_tdb_input, .xf_output = tcp_signature_tdb_output, } #endif /* TCP_SIGNATURE */ }; const struct xformsw *const xformswNXFORMSW = &xformsw[nitems(xformsw)]; #define TDB_HASHSIZE_INIT 32 struct mutex tdb_sadb_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); static SIPHASH_KEY tdbkey; /* [D] */ static struct tdb **tdbh; /* [D] */ static struct tdb **tdbdst; /* [D] */ static struct tdb **tdbsrc; /* [D] */ static u_int tdb_hashmask = TDB_HASHSIZE_INIT - 1; /* [D] */ static int tdb_count; /* [D] */ void ipsp_init(void) { pool_init(&tdb_pool, sizeof(struct tdb), 0, IPL_SOFTNET, 0, "tdb", NULL); arc4random_buf(&tdbkey, sizeof(tdbkey)); tdbh = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_WAITOK | M_ZERO); tdbdst = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_WAITOK | M_ZERO); tdbsrc = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_WAITOK | M_ZERO); } /* * Our hashing function needs to stir things with a non-zero random multiplier * so we cannot be DoS-attacked via choosing of the data to hash. */ int tdb_hash(u_int32_t spi, union sockaddr_union *dst, u_int8_t proto) { SIPHASH_CTX ctx; MUTEX_ASSERT_LOCKED(&tdb_sadb_mtx); SipHash24_Init(&ctx, &tdbkey); SipHash24_Update(&ctx, &spi, sizeof(spi)); SipHash24_Update(&ctx, &proto, sizeof(proto)); SipHash24_Update(&ctx, dst, dst->sa.sa_len); return (SipHash24_End(&ctx) & tdb_hashmask); } /* * Reserve an SPI; the SA is not valid yet though. We use 0 as * an error return value. */ u_int32_t reserve_spi(u_int rdomain, u_int32_t sspi, u_int32_t tspi, union sockaddr_union *src, union sockaddr_union *dst, u_int8_t sproto, int *errval) { struct tdb *tdbp, *exists; u_int32_t spi; int nums; /* Don't accept ranges only encompassing reserved SPIs. */ if (sproto != IPPROTO_IPCOMP && (tspi < sspi || tspi <= SPI_RESERVED_MAX)) { (*errval) = EINVAL; return 0; } if (sproto == IPPROTO_IPCOMP && (tspi < sspi || tspi <= CPI_RESERVED_MAX || tspi >= CPI_PRIVATE_MIN)) { (*errval) = EINVAL; return 0; } /* Limit the range to not include reserved areas. */ if (sspi <= SPI_RESERVED_MAX) sspi = SPI_RESERVED_MAX + 1; /* For IPCOMP the CPI is only 16 bits long, what a good idea.... */ if (sproto == IPPROTO_IPCOMP) { u_int32_t t; if (sspi >= 0x10000) sspi = 0xffff; if (tspi >= 0x10000) tspi = 0xffff; if (sspi > tspi) { t = sspi; sspi = tspi; tspi = t; } } if (sspi == tspi) /* Asking for a specific SPI. 
*/ nums = 1; else nums = 100; /* Arbitrarily chosen */ /* allocate ahead of time to avoid potential sleeping race in loop */ tdbp = tdb_alloc(rdomain); while (nums--) { if (sspi == tspi) /* Specific SPI asked. */ spi = tspi; else /* Range specified */ spi = sspi + arc4random_uniform(tspi - sspi); /* Don't allocate reserved SPIs. */ if (spi >= SPI_RESERVED_MIN && spi <= SPI_RESERVED_MAX) continue; else spi = htonl(spi); /* Check whether we're using this SPI already. */ exists = gettdb(rdomain, spi, dst, sproto); if (exists != NULL) { tdb_unref(exists); continue; } tdbp->tdb_spi = spi; memcpy(&tdbp->tdb_dst.sa, &dst->sa, dst->sa.sa_len); memcpy(&tdbp->tdb_src.sa, &src->sa, src->sa.sa_len); tdbp->tdb_sproto = sproto; tdbp->tdb_flags |= TDBF_INVALID; /* Mark SA invalid for now. */ tdbp->tdb_satype = SADB_SATYPE_UNSPEC; puttdb(tdbp); #ifdef IPSEC /* Setup a "silent" expiration (since TDBF_INVALID's set). */ if (ipsec_keep_invalid > 0) { mtx_enter(&tdbp->tdb_mtx); tdbp->tdb_flags |= TDBF_TIMER; tdbp->tdb_exp_timeout = ipsec_keep_invalid; if (timeout_add_sec(&tdbp->tdb_timer_tmo, ipsec_keep_invalid)) tdb_ref(tdbp); mtx_leave(&tdbp->tdb_mtx); } #endif return spi; } (*errval) = EEXIST; tdb_unref(tdbp); return 0; } /* * An IPSP SAID is really the concatenation of the SPI found in the * packet, the destination address of the packet and the IPsec protocol. * When we receive an IPSP packet, we need to look up its tunnel descriptor * block, based on the SPI in the packet and the destination address (which * is really one of our addresses if we received the packet! */ struct tdb * gettdb_dir(u_int rdomain, u_int32_t spi, union sockaddr_union *dst, u_int8_t proto, int reverse) { u_int32_t hashval; struct tdb *tdbp; NET_ASSERT_LOCKED(); mtx_enter(&tdb_sadb_mtx); hashval = tdb_hash(spi, dst, proto); for (tdbp = tdbh[hashval]; tdbp != NULL; tdbp = tdbp->tdb_hnext) if ((tdbp->tdb_spi == spi) && (tdbp->tdb_sproto == proto) && ((!reverse && tdbp->tdb_rdomain == rdomain) || (reverse && tdbp->tdb_rdomain_post == rdomain)) && !memcmp(&tdbp->tdb_dst, dst, dst->sa.sa_len)) break; tdb_ref(tdbp); mtx_leave(&tdb_sadb_mtx); return tdbp; } /* * Same as gettdb() but compare SRC as well, so we * use the tdbsrc[] hash table. Setting spi to 0 * matches all SPIs. 
*/ struct tdb * gettdbbysrcdst_dir(u_int rdomain, u_int32_t spi, union sockaddr_union *src, union sockaddr_union *dst, u_int8_t proto, int reverse) { u_int32_t hashval; struct tdb *tdbp; union sockaddr_union su_null; mtx_enter(&tdb_sadb_mtx); hashval = tdb_hash(0, src, proto); for (tdbp = tdbsrc[hashval]; tdbp != NULL; tdbp = tdbp->tdb_snext) { if (tdbp->tdb_sproto == proto && (spi == 0 || tdbp->tdb_spi == spi) && ((!reverse && tdbp->tdb_rdomain == rdomain) || (reverse && tdbp->tdb_rdomain_post == rdomain)) && ((tdbp->tdb_flags & TDBF_INVALID) == 0) && (tdbp->tdb_dst.sa.sa_family == AF_UNSPEC || !memcmp(&tdbp->tdb_dst, dst, dst->sa.sa_len)) && !memcmp(&tdbp->tdb_src, src, src->sa.sa_len)) break; } if (tdbp != NULL) { tdb_ref(tdbp); mtx_leave(&tdb_sadb_mtx); return tdbp; } memset(&su_null, 0, sizeof(su_null)); su_null.sa.sa_len = sizeof(struct sockaddr); hashval = tdb_hash(0, &su_null, proto); for (tdbp = tdbsrc[hashval]; tdbp != NULL; tdbp = tdbp->tdb_snext) { if (tdbp->tdb_sproto == proto && (spi == 0 || tdbp->tdb_spi == spi) && ((!reverse && tdbp->tdb_rdomain == rdomain) || (reverse && tdbp->tdb_rdomain_post == rdomain)) && ((tdbp->tdb_flags & TDBF_INVALID) == 0) && (tdbp->tdb_dst.sa.sa_family == AF_UNSPEC || !memcmp(&tdbp->tdb_dst, dst, dst->sa.sa_len)) && tdbp->tdb_src.sa.sa_family == AF_UNSPEC) break; } tdb_ref(tdbp); mtx_leave(&tdb_sadb_mtx); return tdbp; } /* * Check that IDs match. Return true if so. The t* range of * arguments contains information from TDBs; the p* range of * arguments contains information from policies or already * established TDBs. */ int ipsp_aux_match(struct tdb *tdb, struct ipsec_ids *ids, struct sockaddr_encap *pfilter, struct sockaddr_encap *pfiltermask) { if (ids != NULL) if (tdb->tdb_ids == NULL || !ipsp_ids_match(tdb->tdb_ids, ids)) return 0; /* Check for filter matches. */ if (pfilter != NULL && pfiltermask != NULL && tdb->tdb_filter.sen_type) { /* * XXX We should really be doing a subnet-check (see * whether the TDB-associated filter is a subset * of the policy's. For now, an exact match will solve * most problems (all this will do is make every * policy get its own SAs). */ if (memcmp(&tdb->tdb_filter, pfilter, sizeof(struct sockaddr_encap)) || memcmp(&tdb->tdb_filtermask, pfiltermask, sizeof(struct sockaddr_encap))) return 0; } return 1; } /* * Get an SA given the remote address, the security protocol type, and * the desired IDs. */ struct tdb * gettdbbydst(u_int rdomain, union sockaddr_union *dst, u_int8_t sproto, struct ipsec_ids *ids, struct sockaddr_encap *filter, struct sockaddr_encap *filtermask) { u_int32_t hashval; struct tdb *tdbp; mtx_enter(&tdb_sadb_mtx); hashval = tdb_hash(0, dst, sproto); for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext) if ((tdbp->tdb_sproto == sproto) && (tdbp->tdb_rdomain == rdomain) && ((tdbp->tdb_flags & TDBF_INVALID) == 0) && (!memcmp(&tdbp->tdb_dst, dst, dst->sa.sa_len))) { /* Check whether IDs match */ if (!ipsp_aux_match(tdbp, ids, filter, filtermask)) continue; break; } tdb_ref(tdbp); mtx_leave(&tdb_sadb_mtx); return tdbp; } /* * Get an SA given the source address, the security protocol type, and * the desired IDs. 
*/ struct tdb * gettdbbysrc(u_int rdomain, union sockaddr_union *src, u_int8_t sproto, struct ipsec_ids *ids, struct sockaddr_encap *filter, struct sockaddr_encap *filtermask) { u_int32_t hashval; struct tdb *tdbp; mtx_enter(&tdb_sadb_mtx); hashval = tdb_hash(0, src, sproto); for (tdbp = tdbsrc[hashval]; tdbp != NULL; tdbp = tdbp->tdb_snext) { if ((tdbp->tdb_sproto == sproto) && (tdbp->tdb_rdomain == rdomain) && ((tdbp->tdb_flags & TDBF_INVALID) == 0) && (!memcmp(&tdbp->tdb_src, src, src->sa.sa_len))) { /* Check whether IDs match */ if (!ipsp_aux_match(tdbp, ids, filter, filtermask)) continue; break; } } tdb_ref(tdbp); mtx_leave(&tdb_sadb_mtx); return tdbp; } #ifdef DDB #define NBUCKETS 16 void tdb_hashstats(void) { int i, cnt, buckets[NBUCKETS]; struct tdb *tdbp; if (tdbh == NULL) { db_printf("no tdb hash table\n"); return; } memset(buckets, 0, sizeof(buckets)); for (i = 0; i <= tdb_hashmask; i++) { cnt = 0; for (tdbp = tdbh[i]; cnt < NBUCKETS - 1 && tdbp != NULL; tdbp = tdbp->tdb_hnext) cnt++; buckets[cnt]++; } db_printf("tdb cnt\t\tbucket cnt\n"); for (i = 0; i < NBUCKETS; i++) if (buckets[i] > 0) db_printf("%d%s\t\t%d\n", i, i == NBUCKETS - 1 ? "+" : "", buckets[i]); } #define DUMP(m, f) pr("%18s: " f "\n", #m, tdb->tdb_##m) void tdb_printit(void *addr, int full, int (*pr)(const char *, ...)) { struct tdb *tdb = addr; char buf[INET6_ADDRSTRLEN]; if (full) { pr("tdb at %p\n", tdb); DUMP(hnext, "%p"); DUMP(dnext, "%p"); DUMP(snext, "%p"); DUMP(inext, "%p"); DUMP(onext, "%p"); DUMP(xform, "%p"); pr("%18s: %d\n", "refcnt", tdb->tdb_refcnt.r_refs); DUMP(encalgxform, "%p"); DUMP(authalgxform, "%p"); DUMP(compalgxform, "%p"); pr("%18s: %b\n", "flags", tdb->tdb_flags, TDBF_BITS); /* tdb_XXX_tmo */ DUMP(seq, "%d"); DUMP(exp_allocations, "%d"); DUMP(soft_allocations, "%d"); DUMP(cur_allocations, "%d"); DUMP(exp_bytes, "%lld"); DUMP(soft_bytes, "%lld"); DUMP(cur_bytes, "%lld"); DUMP(exp_timeout, "%lld"); DUMP(soft_timeout, "%lld"); DUMP(established, "%lld"); DUMP(first_use, "%lld"); DUMP(soft_first_use, "%lld"); DUMP(exp_first_use, "%lld"); DUMP(last_used, "%lld"); DUMP(last_marked, "%lld"); /* tdb_data */ DUMP(cryptoid, "%lld"); pr("%18s: %08x\n", "tdb_spi", ntohl(tdb->tdb_spi)); DUMP(amxkeylen, "%d"); DUMP(emxkeylen, "%d"); DUMP(ivlen, "%d"); DUMP(sproto, "%d"); DUMP(wnd, "%d"); DUMP(satype, "%d"); DUMP(updates, "%d"); pr("%18s: %s\n", "dst", ipsp_address(&tdb->tdb_dst, buf, sizeof(buf))); pr("%18s: %s\n", "src", ipsp_address(&tdb->tdb_src, buf, sizeof(buf))); DUMP(amxkey, "%p"); DUMP(emxkey, "%p"); DUMP(rpl, "%lld"); /* tdb_seen */ /* tdb_iv */ DUMP(ids, "%p"); DUMP(ids_swapped, "%d"); DUMP(mtu, "%d"); DUMP(mtutimeout, "%lld"); pr("%18s: %d\n", "udpencap_port", ntohs(tdb->tdb_udpencap_port)); DUMP(tag, "%d"); DUMP(tap, "%d"); DUMP(rdomain, "%d"); DUMP(rdomain_post, "%d"); /* tdb_filter */ /* tdb_filtermask */ /* tdb_policy_head */ /* tdb_sync_entry */ } else { pr("%p:", tdb); pr(" %08x", ntohl(tdb->tdb_spi)); pr(" %s", ipsp_address(&tdb->tdb_src, buf, sizeof(buf))); pr("->%s", ipsp_address(&tdb->tdb_dst, buf, sizeof(buf))); pr(":%d", tdb->tdb_sproto); pr(" #%d", tdb->tdb_refcnt.r_refs); pr(" %08x\n", tdb->tdb_flags); } } #undef DUMP #endif /* DDB */ int tdb_walk(u_int rdomain, int (*walker)(struct tdb *, void *, int), void *arg) { SIMPLEQ_HEAD(, tdb) tdblist; struct tdb *tdbp; int i, rval; /* * The walker may sleep. So we cannot hold the tdb_sadb_mtx while * traversing the tdb_hnext list. Create a new tdb_walk list with * exclusive netlock protection. 
*/ NET_ASSERT_LOCKED_EXCLUSIVE(); SIMPLEQ_INIT(&tdblist); mtx_enter(&tdb_sadb_mtx); for (i = 0; i <= tdb_hashmask; i++) { for (tdbp = tdbh[i]; tdbp != NULL; tdbp = tdbp->tdb_hnext) { if (rdomain != tdbp->tdb_rdomain) continue; tdb_ref(tdbp); SIMPLEQ_INSERT_TAIL(&tdblist, tdbp, tdb_walk); } } mtx_leave(&tdb_sadb_mtx); rval = 0; while ((tdbp = SIMPLEQ_FIRST(&tdblist)) != NULL) { SIMPLEQ_REMOVE_HEAD(&tdblist, tdb_walk); if (rval == 0) rval = walker(tdbp, arg, SIMPLEQ_EMPTY(&tdblist)); tdb_unref(tdbp); } return rval; } void tdb_timeout(void *v) { struct tdb *tdb = v; NET_LOCK(); if (tdb->tdb_flags & TDBF_TIMER) { /* If it's an "invalid" TDB do a silent expiration. */ if (!(tdb->tdb_flags & TDBF_INVALID)) { #ifdef IPSEC ipsecstat_inc(ipsec_exctdb); #endif /* IPSEC */ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD); } tdb_delete(tdb); } /* decrement refcount of the timeout argument */ tdb_unref(tdb); NET_UNLOCK(); } void tdb_firstuse(void *v) { struct tdb *tdb = v; NET_LOCK(); if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) { /* If the TDB hasn't been used, don't renew it. */ if (tdb->tdb_first_use != 0) { #ifdef IPSEC ipsecstat_inc(ipsec_exctdb); #endif /* IPSEC */ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_HARD); } tdb_delete(tdb); } /* decrement refcount of the timeout argument */ tdb_unref(tdb); NET_UNLOCK(); } void tdb_soft_timeout(void *v) { struct tdb *tdb = v; NET_LOCK(); mtx_enter(&tdb->tdb_mtx); if (tdb->tdb_flags & TDBF_SOFT_TIMER) { tdb->tdb_flags &= ~TDBF_SOFT_TIMER; mtx_leave(&tdb->tdb_mtx); /* Soft expirations. */ pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT); } else mtx_leave(&tdb->tdb_mtx); /* decrement refcount of the timeout argument */ tdb_unref(tdb); NET_UNLOCK(); } void tdb_soft_firstuse(void *v) { struct tdb *tdb = v; NET_LOCK(); mtx_enter(&tdb->tdb_mtx); if (tdb->tdb_flags & TDBF_SOFT_FIRSTUSE) { tdb->tdb_flags &= ~TDBF_SOFT_FIRSTUSE; mtx_leave(&tdb->tdb_mtx); /* If the TDB hasn't been used, don't renew it. 
*/ if (tdb->tdb_first_use != 0) pfkeyv2_expire(tdb, SADB_EXT_LIFETIME_SOFT); } else mtx_leave(&tdb->tdb_mtx); /* decrement refcount of the timeout argument */ tdb_unref(tdb); NET_UNLOCK(); } int tdb_rehash(void) { struct tdb **new_tdbh, **new_tdbdst, **new_srcaddr, *tdbp, *tdbnp; u_int i, old_hashmask; u_int32_t hashval; MUTEX_ASSERT_LOCKED(&tdb_sadb_mtx); old_hashmask = tdb_hashmask; tdb_hashmask = (tdb_hashmask << 1) | 1; arc4random_buf(&tdbkey, sizeof(tdbkey)); new_tdbh = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_NOWAIT | M_ZERO); new_tdbdst = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_NOWAIT | M_ZERO); new_srcaddr = mallocarray(tdb_hashmask + 1, sizeof(struct tdb *), M_TDB, M_NOWAIT | M_ZERO); if (new_tdbh == NULL || new_tdbdst == NULL || new_srcaddr == NULL) { free(new_tdbh, M_TDB, 0); free(new_tdbdst, M_TDB, 0); free(new_srcaddr, M_TDB, 0); return (ENOMEM); } for (i = 0; i <= old_hashmask; i++) { for (tdbp = tdbh[i]; tdbp != NULL; tdbp = tdbnp) { tdbnp = tdbp->tdb_hnext; hashval = tdb_hash(tdbp->tdb_spi, &tdbp->tdb_dst, tdbp->tdb_sproto); tdbp->tdb_hnext = new_tdbh[hashval]; new_tdbh[hashval] = tdbp; } for (tdbp = tdbdst[i]; tdbp != NULL; tdbp = tdbnp) { tdbnp = tdbp->tdb_dnext; hashval = tdb_hash(0, &tdbp->tdb_dst, tdbp->tdb_sproto); tdbp->tdb_dnext = new_tdbdst[hashval]; new_tdbdst[hashval] = tdbp; } for (tdbp = tdbsrc[i]; tdbp != NULL; tdbp = tdbnp) { tdbnp = tdbp->tdb_snext; hashval = tdb_hash(0, &tdbp->tdb_src, tdbp->tdb_sproto); tdbp->tdb_snext = new_srcaddr[hashval]; new_srcaddr[hashval] = tdbp; } } free(tdbh, M_TDB, 0); tdbh = new_tdbh; free(tdbdst, M_TDB, 0); tdbdst = new_tdbdst; free(tdbsrc, M_TDB, 0); tdbsrc = new_srcaddr; return 0; } /* * Add TDB in the hash table. */ void puttdb(struct tdb *tdbp) { mtx_enter(&tdb_sadb_mtx); puttdb_locked(tdbp); mtx_leave(&tdb_sadb_mtx); } void puttdb_locked(struct tdb *tdbp) { u_int32_t hashval; MUTEX_ASSERT_LOCKED(&tdb_sadb_mtx); hashval = tdb_hash(tdbp->tdb_spi, &tdbp->tdb_dst, tdbp->tdb_sproto); /* * Rehash if this tdb would cause a bucket to have more than * two items and if the number of tdbs exceed 10% of the * bucket count. This number is arbitrarily chosen and is * just a measure to not keep rehashing when adding and * removing tdbs which happens to always end up in the same * bucket, which is not uncommon when doing manual keying. 
*/ if (tdbh[hashval] != NULL && tdbh[hashval]->tdb_hnext != NULL && tdb_count * 10 > tdb_hashmask + 1) { if (tdb_rehash() == 0) hashval = tdb_hash(tdbp->tdb_spi, &tdbp->tdb_dst, tdbp->tdb_sproto); } tdbp->tdb_hnext = tdbh[hashval]; tdbh[hashval] = tdbp; hashval = tdb_hash(0, &tdbp->tdb_dst, tdbp->tdb_sproto); tdbp->tdb_dnext = tdbdst[hashval]; tdbdst[hashval] = tdbp; hashval = tdb_hash(0, &tdbp->tdb_src, tdbp->tdb_sproto); tdbp->tdb_snext = tdbsrc[hashval]; tdbsrc[hashval] = tdbp; tdb_count++; #ifdef IPSEC if ((tdbp->tdb_flags & (TDBF_INVALID|TDBF_TUNNELING)) == TDBF_TUNNELING) ipsecstat_inc(ipsec_tunnels); #endif /* IPSEC */ ipsec_last_added = getuptime(); } void tdb_unlink(struct tdb *tdbp) { mtx_enter(&tdb_sadb_mtx); tdb_unlink_locked(tdbp); mtx_leave(&tdb_sadb_mtx); } void tdb_unlink_locked(struct tdb *tdbp) { struct tdb *tdbpp; u_int32_t hashval; MUTEX_ASSERT_LOCKED(&tdb_sadb_mtx); hashval = tdb_hash(tdbp->tdb_spi, &tdbp->tdb_dst, tdbp->tdb_sproto); if (tdbh[hashval] == tdbp) { tdbh[hashval] = tdbp->tdb_hnext; } else { for (tdbpp = tdbh[hashval]; tdbpp != NULL; tdbpp = tdbpp->tdb_hnext) { if (tdbpp->tdb_hnext == tdbp) { tdbpp->tdb_hnext = tdbp->tdb_hnext; break; } } } tdbp->tdb_hnext = NULL; hashval = tdb_hash(0, &tdbp->tdb_dst, tdbp->tdb_sproto); if (tdbdst[hashval] == tdbp) { tdbdst[hashval] = tdbp->tdb_dnext; } else { for (tdbpp = tdbdst[hashval]; tdbpp != NULL; tdbpp = tdbpp->tdb_dnext) { if (tdbpp->tdb_dnext == tdbp) { tdbpp->tdb_dnext = tdbp->tdb_dnext; break; } } } tdbp->tdb_dnext = NULL; hashval = tdb_hash(0, &tdbp->tdb_src, tdbp->tdb_sproto); if (tdbsrc[hashval] == tdbp) { tdbsrc[hashval] = tdbp->tdb_snext; } else { for (tdbpp = tdbsrc[hashval]; tdbpp != NULL; tdbpp = tdbpp->tdb_snext) { if (tdbpp->tdb_snext == tdbp) { tdbpp->tdb_snext = tdbp->tdb_snext; break; } } } tdbp->tdb_snext = NULL; tdb_count--; #ifdef IPSEC if ((tdbp->tdb_flags & (TDBF_INVALID|TDBF_TUNNELING)) == TDBF_TUNNELING) { ipsecstat_dec(ipsec_tunnels); ipsecstat_inc(ipsec_prevtunnels); } #endif /* IPSEC */ } void tdb_cleanspd(struct tdb *tdbp) { struct ipsec_policy *ipo; mtx_enter(&ipo_tdb_mtx); while ((ipo = TAILQ_FIRST(&tdbp->tdb_policy_head)) != NULL) { TAILQ_REMOVE(&tdbp->tdb_policy_head, ipo, ipo_tdb_next); tdb_unref(ipo->ipo_tdb); ipo->ipo_tdb = NULL; ipo->ipo_last_searched = 0; /* Force a re-search. 
*/ } mtx_leave(&ipo_tdb_mtx); } void tdb_unbundle(struct tdb *tdbp) { if (tdbp->tdb_onext != NULL) { if (tdbp->tdb_onext->tdb_inext == tdbp) { tdb_unref(tdbp); /* to us */ tdbp->tdb_onext->tdb_inext = NULL; } tdb_unref(tdbp->tdb_onext); /* to other */ tdbp->tdb_onext = NULL; } if (tdbp->tdb_inext != NULL) { if (tdbp->tdb_inext->tdb_onext == tdbp) { tdb_unref(tdbp); /* to us */ tdbp->tdb_inext->tdb_onext = NULL; } tdb_unref(tdbp->tdb_inext); /* to other */ tdbp->tdb_inext = NULL; } } void tdb_deltimeouts(struct tdb *tdbp) { mtx_enter(&tdbp->tdb_mtx); tdbp->tdb_flags &= ~(TDBF_FIRSTUSE | TDBF_SOFT_FIRSTUSE | TDBF_TIMER | TDBF_SOFT_TIMER); if (timeout_del(&tdbp->tdb_timer_tmo)) tdb_unref(tdbp); if (timeout_del(&tdbp->tdb_first_tmo)) tdb_unref(tdbp); if (timeout_del(&tdbp->tdb_stimer_tmo)) tdb_unref(tdbp); if (timeout_del(&tdbp->tdb_sfirst_tmo)) tdb_unref(tdbp); mtx_leave(&tdbp->tdb_mtx); } struct tdb * tdb_ref(struct tdb *tdb) { if (tdb == NULL) return NULL; refcnt_take(&tdb->tdb_refcnt); return tdb; } void tdb_unref(struct tdb *tdb) { if (tdb == NULL) return; if (refcnt_rele(&tdb->tdb_refcnt) == 0) return; tdb_free(tdb); } void tdb_delete(struct tdb *tdbp) { NET_ASSERT_LOCKED(); mtx_enter(&tdbp->tdb_mtx); if (tdbp->tdb_flags & TDBF_DELETED) { mtx_leave(&tdbp->tdb_mtx); return; } tdbp->tdb_flags |= TDBF_DELETED; mtx_leave(&tdbp->tdb_mtx); tdb_unlink(tdbp); /* cleanup SPD references */ tdb_cleanspd(tdbp); /* release tdb_onext/tdb_inext references */ tdb_unbundle(tdbp); /* delete timeouts and release references */ tdb_deltimeouts(tdbp); /* release the reference for tdb_unlink() */ tdb_unref(tdbp); } /* * Allocate a TDB and initialize a few basic fields. */ struct tdb * tdb_alloc(u_int rdomain) { struct tdb *tdbp; tdbp = pool_get(&tdb_pool, PR_WAITOK | PR_ZERO); refcnt_init_trace(&tdbp->tdb_refcnt, DT_REFCNT_IDX_TDB); mtx_init(&tdbp->tdb_mtx, IPL_SOFTNET); TAILQ_INIT(&tdbp->tdb_policy_head); /* Record establishment time. */ tdbp->tdb_established = gettime(); /* Save routing domain */ tdbp->tdb_rdomain = rdomain; tdbp->tdb_rdomain_post = rdomain; /* Initialize counters. */ tdbp->tdb_counters = counters_alloc(tdb_ncounters); /* Initialize timeouts. */ timeout_set_proc(&tdbp->tdb_timer_tmo, tdb_timeout, tdbp); timeout_set_proc(&tdbp->tdb_first_tmo, tdb_firstuse, tdbp); timeout_set_proc(&tdbp->tdb_stimer_tmo, tdb_soft_timeout, tdbp); timeout_set_proc(&tdbp->tdb_sfirst_tmo, tdb_soft_firstuse, tdbp); return tdbp; } void tdb_free(struct tdb *tdbp) { NET_ASSERT_LOCKED(); if (tdbp->tdb_xform) { (*(tdbp->tdb_xform->xf_zeroize))(tdbp); tdbp->tdb_xform = NULL; } #if NPFSYNC > 0 /* Cleanup pfsync references */ pfsync_delete_tdb(tdbp); #endif KASSERT(TAILQ_EMPTY(&tdbp->tdb_policy_head)); if (tdbp->tdb_ids) { ipsp_ids_free(tdbp->tdb_ids); tdbp->tdb_ids = NULL; } #if NPF > 0 if (tdbp->tdb_tag) { pf_tag_unref(tdbp->tdb_tag); tdbp->tdb_tag = 0; } #endif counters_free(tdbp->tdb_counters, tdb_ncounters); KASSERT(tdbp->tdb_onext == NULL); KASSERT(tdbp->tdb_inext == NULL); /* Remove expiration timeouts. */ KASSERT(timeout_pending(&tdbp->tdb_timer_tmo) == 0); KASSERT(timeout_pending(&tdbp->tdb_first_tmo) == 0); KASSERT(timeout_pending(&tdbp->tdb_stimer_tmo) == 0); KASSERT(timeout_pending(&tdbp->tdb_sfirst_tmo) == 0); pool_put(&tdb_pool, tdbp); } /* * Do further initializations of a TDB. 
*/ int tdb_init(struct tdb *tdbp, u_int16_t alg, struct ipsecinit *ii) { const struct xformsw *xsp; int err; #ifdef ENCDEBUG char buf[INET6_ADDRSTRLEN]; #endif for (xsp = xformsw; xsp < xformswNXFORMSW; xsp++) { if (xsp->xf_type == alg) { err = (*(xsp->xf_init))(tdbp, xsp, ii); return err; } } DPRINTF("no alg %d for spi %08x, addr %s, proto %d", alg, ntohl(tdbp->tdb_spi), ipsp_address(&tdbp->tdb_dst, buf, sizeof(buf)), tdbp->tdb_sproto); return EINVAL; } #if defined(DDB) || defined(ENCDEBUG) /* Return a printable string for the address. */ const char * ipsp_address(union sockaddr_union *sa, char *buf, socklen_t size) { switch (sa->sa.sa_family) { case AF_INET: return inet_ntop(AF_INET, &sa->sin.sin_addr, buf, (size_t)size); #ifdef INET6 case AF_INET6: return inet_ntop(AF_INET6, &sa->sin6.sin6_addr, buf, (size_t)size); #endif /* INET6 */ default: return "(unknown address family)"; } } #endif /* DDB || ENCDEBUG */ /* Check whether an IP{4,6} address is unspecified. */ int ipsp_is_unspecified(union sockaddr_union addr) { switch (addr.sa.sa_family) { case AF_INET: if (addr.sin.sin_addr.s_addr == INADDR_ANY) return 1; else return 0; #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&addr.sin6.sin6_addr)) return 1; else return 0; #endif /* INET6 */ case 0: /* No family set. */ default: return 1; } } int ipsp_ids_match(struct ipsec_ids *a, struct ipsec_ids *b) { return a == b; } struct ipsec_ids * ipsp_ids_insert(struct ipsec_ids *ids) { struct ipsec_ids *found; u_int32_t start_flow; mtx_enter(&ipsec_flows_mtx); found = RBT_INSERT(ipsec_ids_tree, &ipsec_ids_tree, ids); if (found) { /* if refcount was zero, then timeout is running */ if ((++found->id_refcount) == 1) { LIST_REMOVE(found, id_gc_list); if (LIST_EMPTY(&ipsp_ids_gc_list)) timeout_del(&ipsp_ids_gc_timeout); } mtx_leave (&ipsec_flows_mtx); DPRINTF("ids %p count %d", found, found->id_refcount); return found; } ids->id_refcount = 1; ids->id_flow = start_flow = ipsec_ids_next_flow; if (++ipsec_ids_next_flow == 0) ipsec_ids_next_flow = 1; while (RBT_INSERT(ipsec_ids_flows, &ipsec_ids_flows, ids) != NULL) { ids->id_flow = ipsec_ids_next_flow; if (++ipsec_ids_next_flow == 0) ipsec_ids_next_flow = 1; if (ipsec_ids_next_flow == start_flow) { RBT_REMOVE(ipsec_ids_tree, &ipsec_ids_tree, ids); mtx_leave(&ipsec_flows_mtx); DPRINTF("ipsec_ids_next_flow exhausted %u", start_flow); return NULL; } } mtx_leave(&ipsec_flows_mtx); DPRINTF("new ids %p flow %u", ids, ids->id_flow); return ids; } struct ipsec_ids * ipsp_ids_lookup(u_int32_t ipsecflowinfo) { struct ipsec_ids key; struct ipsec_ids *ids; key.id_flow = ipsecflowinfo; mtx_enter(&ipsec_flows_mtx); ids = RBT_FIND(ipsec_ids_flows, &ipsec_ids_flows, &key); if (ids != NULL) { if (ids->id_refcount != 0) ids->id_refcount++; else ids = NULL; } mtx_leave(&ipsec_flows_mtx); return ids; } /* free ids only from delayed timeout */ void ipsp_ids_gc(void *arg) { struct ipsec_ids *ids, *tids; mtx_enter(&ipsec_flows_mtx); LIST_FOREACH_SAFE(ids, &ipsp_ids_gc_list, id_gc_list, tids) { KASSERT(ids->id_refcount == 0); DPRINTF("ids %p count %d", ids, ids->id_refcount); if ((--ids->id_gc_ttl) > 0) continue; LIST_REMOVE(ids, id_gc_list); RBT_REMOVE(ipsec_ids_tree, &ipsec_ids_tree, ids); RBT_REMOVE(ipsec_ids_flows, &ipsec_ids_flows, ids); free(ids->id_local, M_CREDENTIALS, 0); free(ids->id_remote, M_CREDENTIALS, 0); free(ids, M_CREDENTIALS, 0); } if (!LIST_EMPTY(&ipsp_ids_gc_list)) timeout_add_sec(&ipsp_ids_gc_timeout, 1); mtx_leave(&ipsec_flows_mtx); } /* decrements refcount, actual free happens in gc */ void 
ipsp_ids_free(struct ipsec_ids *ids) { if (ids == NULL) return; mtx_enter(&ipsec_flows_mtx); /* * If the refcount becomes zero, then a timeout is started. This * timeout must be cancelled if refcount is increased from zero. */ DPRINTF("ids %p count %d", ids, ids->id_refcount); KASSERT(ids->id_refcount > 0); if ((--ids->id_refcount) > 0) { mtx_leave(&ipsec_flows_mtx); return; } /* * Add second for the case ipsp_ids_gc() is already running and * awaits netlock to be released. */ ids->id_gc_ttl = ipsec_ids_idle + 1; if (LIST_EMPTY(&ipsp_ids_gc_list)) timeout_add_sec(&ipsp_ids_gc_timeout, 1); LIST_INSERT_HEAD(&ipsp_ids_gc_list, ids, id_gc_list); mtx_leave(&ipsec_flows_mtx); } static int ipsp_id_cmp(struct ipsec_id *a, struct ipsec_id *b) { if (a->type > b->type) return 1; if (a->type < b->type) return -1; if (a->len > b->len) return 1; if (a->len < b->len) return -1; return memcmp(a + 1, b + 1, a->len); } static inline int ipsp_ids_cmp(const struct ipsec_ids *a, const struct ipsec_ids *b) { int ret; ret = ipsp_id_cmp(a->id_remote, b->id_remote); if (ret != 0) return ret; return ipsp_id_cmp(a->id_local, b->id_local); } static inline int ipsp_ids_flow_cmp(const struct ipsec_ids *a, const struct ipsec_ids *b) { if (a->id_flow > b->id_flow) return 1; if (a->id_flow < b->id_flow) return -1; return 0; }
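/*
 * Example (editor's sketch, not part of the original source): the usual
 * reference-counted lookup pattern for the SA database implemented above.
 * gettdb() returns a tdb with a reference held, or NULL when no matching
 * SA exists; the caller must release that reference with tdb_unref() when
 * it is done.  The variables rdomain, spi and dst stand for a hypothetical
 * caller's routing domain, SPI and destination address.
 *
 *	struct tdb *tdbp;
 *
 *	tdbp = gettdb(rdomain, spi, &dst, IPPROTO_ESP);
 *	if (tdbp == NULL)
 *		return (ENOENT);	(no SA for this SPI/dst/proto)
 *	(... inspect or update the SA while holding the reference ...)
 *	tdb_unref(tdbp);
 */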
/* $OpenBSD: ipsec_input.c,v 1.203 2022/02/22 01:35:40 guenther Exp $ */ /* * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * This code was written by John Ioannidis for BSD/OS in Athens, Greece, * in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001, Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "pf.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/protosw.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <sys/kernel.h> #include <sys/timeout.h> #include <net/if.h> #include <net/if_var.h> #include <net/netisr.h> #include <net/bpf.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #include <netinet/tcp.h> #include <netinet/udp.h> #if NPF > 0 #include <net/pfvar.h> #endif #ifdef INET6 #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #endif /* INET6 */ #include <netinet/ip_ipsp.h> #include <netinet/ip_esp.h> #include <netinet/ip_ah.h> #include <netinet/ip_ipcomp.h> #include <net/if_enc.h> #include <crypto/cryptodev.h> #include <crypto/xform.h> #include "bpfilter.h" void ipsec_common_ctlinput(u_int, int, struct sockaddr *, void *, int); #ifdef ENCDEBUG #define DPRINTF(fmt, args...) \ do { \ if (encdebug) \ printf("%s: " fmt "\n", __func__, ## args); \ } while (0) #else #define DPRINTF(fmt, args...)
\ do { } while (0) #endif /* sysctl variables */ int encdebug = 0; int ipsec_keep_invalid = IPSEC_DEFAULT_EMBRYONIC_SA_TIMEOUT; int ipsec_require_pfs = IPSEC_DEFAULT_PFS; int ipsec_soft_allocations = IPSEC_DEFAULT_SOFT_ALLOCATIONS; int ipsec_exp_allocations = IPSEC_DEFAULT_EXP_ALLOCATIONS; int ipsec_soft_bytes = IPSEC_DEFAULT_SOFT_BYTES; int ipsec_exp_bytes = IPSEC_DEFAULT_EXP_BYTES; int ipsec_soft_timeout = IPSEC_DEFAULT_SOFT_TIMEOUT; int ipsec_exp_timeout = IPSEC_DEFAULT_EXP_TIMEOUT; int ipsec_soft_first_use = IPSEC_DEFAULT_SOFT_FIRST_USE; int ipsec_exp_first_use = IPSEC_DEFAULT_EXP_FIRST_USE; int ipsec_expire_acquire = IPSEC_DEFAULT_EXPIRE_ACQUIRE; int esp_enable = 1; int ah_enable = 1; int ipcomp_enable = 0; const struct sysctl_bounded_args espctl_vars[] = { {ESPCTL_ENABLE, &esp_enable, 0, 1}, {ESPCTL_UDPENCAP_ENABLE, &udpencap_enable, 0, 1}, {ESPCTL_UDPENCAP_PORT, &udpencap_port, 0, 65535}, }; const struct sysctl_bounded_args ahctl_vars[] = { {AHCTL_ENABLE, &ah_enable, 0, 1}, }; const struct sysctl_bounded_args ipcompctl_vars[] = { {IPCOMPCTL_ENABLE, &ipcomp_enable, 0, 1}, }; struct cpumem *espcounters; struct cpumem *ahcounters; struct cpumem *ipcompcounters; struct cpumem *ipseccounters; char ipsec_def_enc[20]; char ipsec_def_auth[20]; char ipsec_def_comp[20]; const struct sysctl_bounded_args ipsecctl_vars[] = { { IPSEC_ENCDEBUG, &encdebug, 0, 1 }, { IPSEC_EXPIRE_ACQUIRE, &ipsec_expire_acquire, 0, INT_MAX }, { IPSEC_EMBRYONIC_SA_TIMEOUT, &ipsec_keep_invalid, 0, INT_MAX }, { IPSEC_REQUIRE_PFS, &ipsec_require_pfs, 0, 1 }, { IPSEC_SOFT_ALLOCATIONS, &ipsec_soft_allocations, 0, INT_MAX }, { IPSEC_ALLOCATIONS, &ipsec_exp_allocations, 0, INT_MAX }, { IPSEC_SOFT_BYTES, &ipsec_soft_bytes, 0, INT_MAX }, { IPSEC_BYTES, &ipsec_exp_bytes, 0, INT_MAX }, { IPSEC_TIMEOUT, &ipsec_exp_timeout, 0, INT_MAX }, { IPSEC_SOFT_TIMEOUT, &ipsec_soft_timeout,0, INT_MAX }, { IPSEC_SOFT_FIRSTUSE, &ipsec_soft_first_use, 0, INT_MAX }, { IPSEC_FIRSTUSE, &ipsec_exp_first_use, 0, INT_MAX }, }; int esp_sysctl_espstat(void *, size_t *, void *); int ah_sysctl_ahstat(void *, size_t *, void *); int ipcomp_sysctl_ipcompstat(void *, size_t *, void *); int ipsec_sysctl_ipsecstat(void *, size_t *, void *); void ipsec_init(void) { espcounters = counters_alloc(esps_ncounters); ahcounters = counters_alloc(ahs_ncounters); ipcompcounters = counters_alloc(ipcomps_ncounters); ipseccounters = counters_alloc(ipsec_ncounters); strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc)); strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth)); strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp)); ipsp_init(); } /* * ipsec_common_input() gets called when we receive an IPsec-protected packet * in IPv4 or IPv6. All it does is find the right TDB and call the appropriate * transform. The callback takes care of further processing (like ingress * filtering). 
*/ int ipsec_common_input(struct mbuf **mp, int skip, int protoff, int af, int sproto, int udpencap) { #define IPSEC_ISTAT(x,y,z) do { \ if (sproto == IPPROTO_ESP) \ espstat_inc(x); \ else if (sproto == IPPROTO_AH) \ ahstat_inc(y); \ else \ ipcompstat_inc(z); \ } while (0) struct mbuf *m = *mp; union sockaddr_union dst_address; struct tdb *tdbp = NULL; u_int32_t spi; u_int16_t cpi; int prot; #ifdef ENCDEBUG char buf[INET6_ADDRSTRLEN]; #endif NET_ASSERT_LOCKED(); ipsecstat_pkt(ipsec_ipackets, ipsec_ibytes, m->m_pkthdr.len); IPSEC_ISTAT(esps_input, ahs_input, ipcomps_input); if ((sproto == IPPROTO_IPCOMP) && (m->m_flags & M_COMP)) { DPRINTF("repeated decompression"); ipcompstat_inc(ipcomps_pdrops); goto drop; } if (m->m_pkthdr.len - skip < 2 * sizeof(u_int32_t)) { DPRINTF("packet too small"); IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto drop; } /* Retrieve the SPI from the relevant IPsec header */ switch (sproto) { case IPPROTO_ESP: m_copydata(m, skip, sizeof(u_int32_t), (caddr_t) &spi); break; case IPPROTO_AH: m_copydata(m, skip + sizeof(u_int32_t), sizeof(u_int32_t), (caddr_t) &spi); break; case IPPROTO_IPCOMP: m_copydata(m, skip + sizeof(u_int16_t), sizeof(u_int16_t), (caddr_t) &cpi); spi = ntohl(htons(cpi)); break; default: panic("%s: unknown/unsupported security protocol %d", __func__, sproto); } /* * Find tunnel control block and (indirectly) call the appropriate * kernel crypto routine. The resulting mbuf chain is a valid * IP packet ready to go through input processing. */ memset(&dst_address, 0, sizeof(dst_address)); dst_address.sa.sa_family = af; switch (af) { case AF_INET: dst_address.sin.sin_len = sizeof(struct sockaddr_in); m_copydata(m, offsetof(struct ip, ip_dst), sizeof(struct in_addr), (caddr_t) &(dst_address.sin.sin_addr)); break; #ifdef INET6 case AF_INET6: dst_address.sin6.sin6_len = sizeof(struct sockaddr_in6); m_copydata(m, offsetof(struct ip6_hdr, ip6_dst), sizeof(struct in6_addr), (caddr_t) &(dst_address.sin6.sin6_addr)); in6_recoverscope(&dst_address.sin6, &dst_address.sin6.sin6_addr); break; #endif /* INET6 */ default: DPRINTF("unsupported protocol family %d", af); IPSEC_ISTAT(esps_nopf, ahs_nopf, ipcomps_nopf); goto drop; } tdbp = gettdb(rtable_l2(m->m_pkthdr.ph_rtableid), spi, &dst_address, sproto); if (tdbp == NULL) { DPRINTF("could not find SA for packet to %s, spi %08x", ipsp_address(&dst_address, buf, sizeof(buf)), ntohl(spi)); IPSEC_ISTAT(esps_notdb, ahs_notdb, ipcomps_notdb); goto drop; } if (tdbp->tdb_flags & TDBF_INVALID) { DPRINTF("attempted to use invalid SA %s/%08x/%u", ipsp_address(&dst_address, buf, sizeof(buf)), ntohl(spi), tdbp->tdb_sproto); IPSEC_ISTAT(esps_invalid, ahs_invalid, ipcomps_invalid); goto drop; } if (udpencap && !(tdbp->tdb_flags & TDBF_UDPENCAP)) { DPRINTF("attempted to use non-udpencap SA %s/%08x/%u", ipsp_address(&dst_address, buf, sizeof(buf)), ntohl(spi), tdbp->tdb_sproto); espstat_inc(esps_udpinval); goto drop; } if (!udpencap && (tdbp->tdb_flags & TDBF_UDPENCAP)) { DPRINTF("attempted to use udpencap SA %s/%08x/%u", ipsp_address(&dst_address, buf, sizeof(buf)), ntohl(spi), tdbp->tdb_sproto); espstat_inc(esps_udpneeded); goto drop; } if (tdbp->tdb_xform == NULL) { DPRINTF("attempted to use uninitialized SA %s/%08x/%u", ipsp_address(&dst_address, buf, sizeof(buf)), ntohl(spi), tdbp->tdb_sproto); IPSEC_ISTAT(esps_noxform, ahs_noxform, ipcomps_noxform); goto drop; } KERNEL_LOCK(); /* Register first use, setup expiration timer. 
*/ if (tdbp->tdb_first_use == 0) { tdbp->tdb_first_use = gettime(); if (tdbp->tdb_flags & TDBF_FIRSTUSE) { if (timeout_add_sec(&tdbp->tdb_first_tmo, tdbp->tdb_exp_first_use)) tdb_ref(tdbp); } if (tdbp->tdb_flags & TDBF_SOFT_FIRSTUSE) { if (timeout_add_sec(&tdbp->tdb_sfirst_tmo, tdbp->tdb_soft_first_use)) tdb_ref(tdbp); } } tdbstat_pkt(tdbp, tdb_ipackets, tdb_ibytes, m->m_pkthdr.len); /* * Call appropriate transform and return -- callback takes care of * everything else. */ prot = (*(tdbp->tdb_xform->xf_input))(mp, tdbp, skip, protoff); if (prot == IPPROTO_DONE) { ipsecstat_inc(ipsec_idrops); tdbstat_inc(tdbp, tdb_idrops); } tdb_unref(tdbp); KERNEL_UNLOCK(); return prot; drop: m_freemp(mp); ipsecstat_inc(ipsec_idrops); if (tdbp != NULL) tdbstat_inc(tdbp, tdb_idrops); tdb_unref(tdbp); return IPPROTO_DONE; } /* * IPsec input callback, called by the transform callback. Takes care of * filtering and other sanity checks on the processed packet. */ int ipsec_common_input_cb(struct mbuf **mp, struct tdb *tdbp, int skip, int protoff) { struct mbuf *m = *mp; int af, sproto; u_int8_t prot; #if NBPFILTER > 0 struct ifnet *encif; #endif struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ struct m_tag *mtag; struct tdb_ident *tdbi; #ifdef ENCDEBUG char buf[INET6_ADDRSTRLEN]; #endif af = tdbp->tdb_dst.sa.sa_family; sproto = tdbp->tdb_sproto; tdbp->tdb_last_used = gettime(); /* Fix IPv4 header */ if (af == AF_INET) { if (m->m_len < skip && (m = *mp = m_pullup(m, skip)) == NULL) { DPRINTF("processing failed for SA %s/%08x", ipsp_address(&tdbp->tdb_dst, buf, sizeof(buf)), ntohl(tdbp->tdb_spi)); IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto baddone; } ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_sum = 0; ip->ip_sum = in_cksum(m, ip->ip_hl << 2); prot = ip->ip_p; } #ifdef INET6 /* Fix IPv6 header */ if (af == AF_INET6) { if (m->m_len < sizeof(struct ip6_hdr) && (m = *mp = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { DPRINTF("processing failed for SA %s/%08x", ipsp_address(&tdbp->tdb_dst, buf, sizeof(buf)), ntohl(tdbp->tdb_spi)); IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto baddone; } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(m->m_pkthdr.len - skip); /* Save protocol */ m_copydata(m, protoff, 1, (caddr_t) &prot); } #endif /* INET6 */ /* * Fix TCP/UDP checksum of UDP encapsulated transport mode ESP packet. 
* (RFC3948 3.1.2) */ if ((af == AF_INET || af == AF_INET6) && (tdbp->tdb_flags & TDBF_UDPENCAP) && (tdbp->tdb_flags & TDBF_TUNNELING) == 0) { u_int16_t cksum; switch (prot) { case IPPROTO_UDP: if (m->m_pkthdr.len < skip + sizeof(struct udphdr)) { IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto baddone; } cksum = 0; m_copyback(m, skip + offsetof(struct udphdr, uh_sum), sizeof(cksum), &cksum, M_NOWAIT); #ifdef INET6 if (af == AF_INET6) { cksum = in6_cksum(m, IPPROTO_UDP, skip, m->m_pkthdr.len - skip); m_copyback(m, skip + offsetof(struct udphdr, uh_sum), sizeof(cksum), &cksum, M_NOWAIT); } #endif break; case IPPROTO_TCP: if (m->m_pkthdr.len < skip + sizeof(struct tcphdr)) { IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto baddone; } cksum = 0; m_copyback(m, skip + offsetof(struct tcphdr, th_sum), sizeof(cksum), &cksum, M_NOWAIT); if (af == AF_INET) cksum = in4_cksum(m, IPPROTO_TCP, skip, m->m_pkthdr.len - skip); #ifdef INET6 else if (af == AF_INET6) cksum = in6_cksum(m, IPPROTO_TCP, skip, m->m_pkthdr.len - skip); #endif m_copyback(m, skip + offsetof(struct tcphdr, th_sum), sizeof(cksum), &cksum, M_NOWAIT); break; } } /* * Record what we've done to the packet (under what SA it was * processed). */ if (tdbp->tdb_sproto != IPPROTO_IPCOMP) { mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE, sizeof(struct tdb_ident), M_NOWAIT); if (mtag == NULL) { DPRINTF("failed to get tag"); IPSEC_ISTAT(esps_hdrops, ahs_hdrops, ipcomps_hdrops); goto baddone; } tdbi = (struct tdb_ident *)(mtag + 1); tdbi->dst = tdbp->tdb_dst; tdbi->proto = tdbp->tdb_sproto; tdbi->spi = tdbp->tdb_spi; tdbi->rdomain = tdbp->tdb_rdomain; m_tag_prepend(m, mtag); } switch (sproto) { case IPPROTO_ESP: /* Packet is confidential ? */ if (tdbp->tdb_encalgxform) m->m_flags |= M_CONF; /* Check if we had authenticated ESP. */ if (tdbp->tdb_authalgxform) m->m_flags |= M_AUTH; break; case IPPROTO_AH: m->m_flags |= M_AUTH; break; case IPPROTO_IPCOMP: m->m_flags |= M_COMP; break; default: panic("%s: unknown/unsupported security protocol %d", __func__, sproto); } #if NPF > 0 /* Add pf tag if requested. */ pf_tag_packet(m, tdbp->tdb_tag, -1); pf_pkt_addr_changed(m); #endif if (tdbp->tdb_rdomain != tdbp->tdb_rdomain_post) m->m_pkthdr.ph_rtableid = tdbp->tdb_rdomain_post; if (tdbp->tdb_flags & TDBF_TUNNELING) m->m_flags |= M_TUNNEL; ipsecstat_add(ipsec_idecompbytes, m->m_pkthdr.len); tdbstat_add(tdbp, tdb_idecompbytes, m->m_pkthdr.len); #if NBPFILTER > 0 encif = enc_getif(tdbp->tdb_rdomain_post, tdbp->tdb_tap); if (encif != NULL) { encif->if_ipackets++; encif->if_ibytes += m->m_pkthdr.len; if (sproto != IPPROTO_IPCOMP) { /* XXX This conflicts with the scoped nature of IPv6 */ m->m_pkthdr.ph_ifidx = encif->if_index; } if (encif->if_bpf) { struct enchdr hdr; hdr.af = af; hdr.spi = tdbp->tdb_spi; hdr.flags = m->m_flags & (M_AUTH|M_CONF); bpf_mtap_hdr(encif->if_bpf, (char *)&hdr, ENC_HDRLEN, m, BPF_DIRECTION_IN); } } #endif #if NPF > 0 /* * The ip_deliver() shortcut avoids running through ip_input() with the * same IP header twice. Packets in transport mode have to be be * passed to pf explicitly. In tunnel mode the inner IP header will * run through ip_input() and pf anyway. */ if ((tdbp->tdb_flags & TDBF_TUNNELING) == 0) { struct ifnet *ifp; /* This is the enc0 interface unless for ipcomp. 
*/ if ((ifp = if_get(m->m_pkthdr.ph_ifidx)) == NULL) { goto baddone; } if (pf_test(af, PF_IN, ifp, mp) != PF_PASS) { if_put(ifp); goto baddone; } m = *mp; if_put(ifp); if (m == NULL) return IPPROTO_DONE; } #endif /* Return to the appropriate protocol handler in deliver loop. */ return prot; baddone: m_freemp(mp); return IPPROTO_DONE; #undef IPSEC_ISTAT } int ipsec_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; switch (name[0]) { case IPCTL_IPSEC_ENC_ALGORITHM: NET_LOCK(); error = sysctl_tstring(oldp, oldlenp, newp, newlen, ipsec_def_enc, sizeof(ipsec_def_enc)); NET_UNLOCK(); return (error); case IPCTL_IPSEC_AUTH_ALGORITHM: NET_LOCK(); error = sysctl_tstring(oldp, oldlenp, newp, newlen, ipsec_def_auth, sizeof(ipsec_def_auth)); NET_UNLOCK(); return (error); case IPCTL_IPSEC_IPCOMP_ALGORITHM: NET_LOCK(); error = sysctl_tstring(oldp, oldlenp, newp, newlen, ipsec_def_comp, sizeof(ipsec_def_comp)); NET_UNLOCK(); return (error); case IPCTL_IPSEC_STATS: return (ipsec_sysctl_ipsecstat(oldp, oldlenp, newp)); default: NET_LOCK(); error = sysctl_bounded_arr(ipsecctl_vars, nitems(ipsecctl_vars), name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); return (error); } } int esp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; /* All sysctl names at this level are terminal. */ if (namelen != 1) return (ENOTDIR); switch (name[0]) { case ESPCTL_STATS: return (esp_sysctl_espstat(oldp, oldlenp, newp)); default: NET_LOCK(); error = sysctl_bounded_arr(espctl_vars, nitems(espctl_vars), name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); return (error); } } int esp_sysctl_espstat(void *oldp, size_t *oldlenp, void *newp) { struct espstat espstat; CTASSERT(sizeof(espstat) == (esps_ncounters * sizeof(uint64_t))); memset(&espstat, 0, sizeof espstat); counters_read(espcounters, (uint64_t *)&espstat, esps_ncounters); return (sysctl_rdstruct(oldp, oldlenp, newp, &espstat, sizeof(espstat))); } int ah_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; /* All sysctl names at this level are terminal. */ if (namelen != 1) return (ENOTDIR); switch (name[0]) { case AHCTL_STATS: return ah_sysctl_ahstat(oldp, oldlenp, newp); default: NET_LOCK(); error = sysctl_bounded_arr(ahctl_vars, nitems(ahctl_vars), name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); return (error); } } int ah_sysctl_ahstat(void *oldp, size_t *oldlenp, void *newp) { struct ahstat ahstat; CTASSERT(sizeof(ahstat) == (ahs_ncounters * sizeof(uint64_t))); memset(&ahstat, 0, sizeof ahstat); counters_read(ahcounters, (uint64_t *)&ahstat, ahs_ncounters); return (sysctl_rdstruct(oldp, oldlenp, newp, &ahstat, sizeof(ahstat))); } int ipcomp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; /* All sysctl names at this level are terminal. 
*/ if (namelen != 1) return (ENOTDIR); switch (name[0]) { case IPCOMPCTL_STATS: return ipcomp_sysctl_ipcompstat(oldp, oldlenp, newp); default: NET_LOCK(); error = sysctl_bounded_arr(ipcompctl_vars, nitems(ipcompctl_vars), name, namelen, oldp, oldlenp, newp, newlen); NET_UNLOCK(); return (error); } } int ipcomp_sysctl_ipcompstat(void *oldp, size_t *oldlenp, void *newp) { struct ipcompstat ipcompstat; CTASSERT(sizeof(ipcompstat) == (ipcomps_ncounters * sizeof(uint64_t))); memset(&ipcompstat, 0, sizeof ipcompstat); counters_read(ipcompcounters, (uint64_t *)&ipcompstat, ipcomps_ncounters); return (sysctl_rdstruct(oldp, oldlenp, newp, &ipcompstat, sizeof(ipcompstat))); } int ipsec_sysctl_ipsecstat(void *oldp, size_t *oldlenp, void *newp) { struct ipsecstat ipsecstat; CTASSERT(sizeof(ipsecstat) == (ipsec_ncounters * sizeof(uint64_t))); memset(&ipsecstat, 0, sizeof ipsecstat); counters_read(ipseccounters, (uint64_t *)&ipsecstat, ipsec_ncounters); return (sysctl_rdstruct(oldp, oldlenp, newp, &ipsecstat, sizeof(ipsecstat))); } int ipsec_input_disabled(struct mbuf **mp, int *offp, int proto, int af) { switch (af) { case AF_INET: return rip_input(mp, offp, proto, af); #ifdef INET6 case AF_INET6: return rip6_input(mp, offp, proto, af); #endif default: unhandled_af(af); } } int ah46_input(struct mbuf **mp, int *offp, int proto, int af) { int protoff; if ( #if NPF > 0 ((*mp)->m_pkthdr.pf.flags & PF_TAG_DIVERTED) || #endif !ah_enable) return ipsec_input_disabled(mp, offp, proto, af); protoff = ipsec_protoff(*mp, *offp, af); if (protoff < 0) { DPRINTF("bad packet header chain"); ahstat_inc(ahs_hdrops); m_freemp(mp); return IPPROTO_DONE; } return ipsec_common_input(mp, *offp, protoff, af, proto, 0); } void ah4_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *v) { if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return; ipsec_common_ctlinput(rdomain, cmd, sa, v, IPPROTO_AH); } int esp46_input(struct mbuf **mp, int *offp, int proto, int af) { int protoff; if ( #if NPF > 0 ((*mp)->m_pkthdr.pf.flags & PF_TAG_DIVERTED) || #endif !esp_enable) return ipsec_input_disabled(mp, offp, proto, af); protoff = ipsec_protoff(*mp, *offp, af); if (protoff < 0) { DPRINTF("bad packet header chain"); espstat_inc(esps_hdrops); m_freemp(mp); return IPPROTO_DONE; } return ipsec_common_input(mp, *offp, protoff, af, proto, 0); } /* IPv4 IPCOMP wrapper */ int ipcomp46_input(struct mbuf **mp, int *offp, int proto, int af) { int protoff; if ( #if NPF > 0 ((*mp)->m_pkthdr.pf.flags & PF_TAG_DIVERTED) || #endif !ipcomp_enable) return ipsec_input_disabled(mp, offp, proto, af); protoff = ipsec_protoff(*mp, *offp, af); if (protoff < 0) { DPRINTF("bad packet header chain"); ipcompstat_inc(ipcomps_hdrops); m_freemp(mp); return IPPROTO_DONE; } return ipsec_common_input(mp, *offp, protoff, af, proto, 0); } void ipsec_set_mtu(struct tdb *tdbp, u_int32_t mtu) { ssize_t adjust; NET_ASSERT_LOCKED(); /* Walk the chain backwards to the first tdb */ for (; tdbp != NULL; tdbp = tdbp->tdb_inext) { if (tdbp->tdb_flags & TDBF_INVALID || (adjust = ipsec_hdrsz(tdbp)) == -1) return; mtu -= adjust; /* Store adjusted MTU in tdb */ tdbp->tdb_mtu = mtu; tdbp->tdb_mtutimeout = gettime() + ip_mtudisc_timeout; DPRINTF("spi %08x mtu %d adjust %ld", ntohl(tdbp->tdb_spi), tdbp->tdb_mtu, adjust); } } void ipsec_common_ctlinput(u_int rdomain, int cmd, struct sockaddr *sa, void *v, int proto) { struct ip *ip = v; if (cmd == PRC_MSGSIZE && ip && ip_mtudisc && ip->ip_v == 4) { struct tdb *tdbp; struct sockaddr_in dst; struct icmp *icp; 
int hlen = ip->ip_hl << 2; u_int32_t spi, mtu; /* Find the right MTU. */ icp = (struct icmp *)((caddr_t) ip - offsetof(struct icmp, icmp_ip)); mtu = ntohs(icp->icmp_nextmtu); /* * Ignore the packet, if we do not receive a MTU * or the MTU is too small to be acceptable. */ if (mtu < 296) return; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_addr.s_addr = ip->ip_dst.s_addr; memcpy(&spi, (caddr_t)ip + hlen, sizeof(u_int32_t)); tdbp = gettdb_rev(rdomain, spi, (union sockaddr_union *)&dst, proto); ipsec_set_mtu(tdbp, mtu); tdb_unref(tdbp); } } void udpencap_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *v) { struct ip *ip = v; struct tdb *tdbp, *first; struct icmp *icp; u_int32_t mtu; struct sockaddr_in dst, src; union sockaddr_union *su_dst, *su_src; NET_ASSERT_LOCKED(); icp = (struct icmp *)((caddr_t) ip - offsetof(struct icmp, icmp_ip)); mtu = ntohs(icp->icmp_nextmtu); /* * Ignore the packet, if we do not receive a MTU * or the MTU is too small to be acceptable. */ if (mtu < 296) return; memset(&dst, 0, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_addr.s_addr = ip->ip_dst.s_addr; su_dst = (union sockaddr_union *)&dst; memset(&src, 0, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_addr.s_addr = ip->ip_src.s_addr; su_src = (union sockaddr_union *)&src; first = gettdbbysrcdst_rev(rdomain, 0, su_src, su_dst, IPPROTO_ESP); mtx_enter(&tdb_sadb_mtx); for (tdbp = first; tdbp != NULL; tdbp = tdbp->tdb_snext) { if (tdbp->tdb_sproto == IPPROTO_ESP && ((tdbp->tdb_flags & (TDBF_INVALID|TDBF_UDPENCAP)) == TDBF_UDPENCAP) && !memcmp(&tdbp->tdb_dst, &dst, su_dst->sa.sa_len) && !memcmp(&tdbp->tdb_src, &src, su_src->sa.sa_len)) ipsec_set_mtu(tdbp, mtu); } mtx_leave(&tdb_sadb_mtx); tdb_unref(first); } void esp4_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *v) { if (sa->sa_family != AF_INET || sa->sa_len != sizeof(struct sockaddr_in)) return; ipsec_common_ctlinput(rdomain, cmd, sa, v, IPPROTO_ESP); } /* Find the offset of the next protocol field in the previous header. */ int ipsec_protoff(struct mbuf *m, int off, int af) { #ifdef INET6 struct ip6_ext ip6e; int protoff, nxt, l; #endif /* INET6 */ switch (af) { case AF_INET: return offsetof(struct ip, ip_p); #ifdef INET6 case AF_INET6: break; #endif /* INET6 */ default: unhandled_af(af); } #ifdef INET6 if (off < sizeof(struct ip6_hdr)) return -1; if (off == sizeof(struct ip6_hdr)) return offsetof(struct ip6_hdr, ip6_nxt); /* Chase down the header chain... */ protoff = sizeof(struct ip6_hdr); nxt = (mtod(m, struct ip6_hdr *))->ip6_nxt; l = 0; do { protoff += l; m_copydata(m, protoff, sizeof(ip6e), (caddr_t) &ip6e); if (nxt == IPPROTO_AH) l = (ip6e.ip6e_len + 2) << 2; else l = (ip6e.ip6e_len + 1) << 3; #ifdef DIAGNOSTIC if (l <= 0) panic("%s: l went zero or negative", __func__); #endif nxt = ip6e.ip6e_nxt; } while (protoff + l < off); /* Malformed packet check */ if (protoff + l != off) return -1; protoff += offsetof(struct ip6_ext, ip6e_nxt); return protoff; #endif /* INET6 */ } int ipsec_forward_check(struct mbuf *m, int hlen, int af) { struct tdb *tdb; struct tdb_ident *tdbi; struct m_tag *mtag; int error = 0; /* * IPsec policy check for forwarded packets. Look at * inner-most IPsec SA used. 
*/ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); tdb = gettdb(tdbi->rdomain, tdbi->spi, &tdbi->dst, tdbi->proto); } else tdb = NULL; error = ipsp_spd_lookup(m, af, hlen, IPSP_DIRECTION_IN, tdb, NULL, NULL, NULL); tdb_unref(tdb); return error; } int ipsec_local_check(struct mbuf *m, int hlen, int proto, int af) { struct tdb *tdb; struct tdb_ident *tdbi; struct m_tag *mtag; int error = 0; /* * If it's a protected packet for us, skip the policy check. * That's because we really only care about the properties of * the protected packet, and not the intermediate versions. * While this is not the most paranoid setting, it allows * some flexibility in handling nested tunnels (in setting up * the policies). */ if ((proto == IPPROTO_ESP) || (proto == IPPROTO_AH) || (proto == IPPROTO_IPCOMP)) return 0; /* * If the protected packet was tunneled, then we need to * verify the protected packet's information, not the * external headers. Thus, skip the policy lookup for the * external packet, and keep the IPsec information linked on * the packet header (the encapsulation routines know how * to deal with that). */ if ((proto == IPPROTO_IPV4) || (proto == IPPROTO_IPV6)) return 0; /* * When processing IPv6 header chains, do not look at the * outer header. The inner protocol is relevant and will * be checked by the local delivery loop later. */ if ((af == AF_INET6) && ((proto == IPPROTO_DSTOPTS) || (proto == IPPROTO_ROUTING) || (proto == IPPROTO_FRAGMENT))) return 0; /* * If the protected packet is TCP or UDP, we'll do the * policy check in the respective input routine, so we can * check for bypass sockets. */ if ((proto == IPPROTO_TCP) || (proto == IPPROTO_UDP)) return 0; /* * IPsec policy check for local-delivery packets. Look at the * inner-most SA that protected the packet. This is in fact * a bit too restrictive (it could end up causing packets to * be dropped that semantically follow the policy, e.g., in * certain SA-bundle configurations); but the alternative is * very complicated (and requires keeping track of what * kinds of tunneling headers have been seen in-between the * IPsec headers), and I don't think we lose much functionality * that's needed in the real world (who uses bundles anyway ?). */ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag) { tdbi = (struct tdb_ident *)(mtag + 1); tdb = gettdb(tdbi->rdomain, tdbi->spi, &tdbi->dst, tdbi->proto); } else tdb = NULL; error = ipsp_spd_lookup(m, af, hlen, IPSP_DIRECTION_IN, tdb, NULL, NULL, NULL); tdb_unref(tdb); return error; }
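/*
 * Illustrative userland sketch (not part of the kernel code above):
 * esp_sysctl_espstat() copies the per-CPU ESP counters into a flat
 * struct of uint64_t values and exports it through sysctl_rdstruct(),
 * so a reader such as netstat(1) can fetch the whole block with one
 * sysctl(2) call.  The MIB path CTL_NET / PF_INET / IPPROTO_ESP /
 * ESPCTL_STATS used below is an assumption for illustration; verify it
 * against <netinet/ip_esp.h> on the target system before relying on it.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/ip_esp.h>	/* struct espstat, ESPCTL_STATS (assumed) */
#include <stdint.h>
#include <stdio.h>

int
dump_esp_counters(void)
{
	struct espstat es;
	uint64_t *counter = (uint64_t *)&es;
	size_t i, len = sizeof(es);
	int mib[4] = { CTL_NET, PF_INET, IPPROTO_ESP, ESPCTL_STATS };

	if (sysctl(mib, 4, &es, &len, NULL, 0) == -1) {
		perror("sysctl");
		return (-1);
	}
	/* same flat uint64_t view that the kernel CTASSERT relies on */
	for (i = 0; i < len / sizeof(uint64_t); i++)
		printf("esp counter %zu: %llu\n", i,
		    (unsigned long long)counter[i]);
	return (0);
}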
/* $OpenBSD: nd6.c,v 1.246 2022/08/09 21:10:03 kn Exp $ */ /* $KAME: nd6.c,v 1.280 2002/06/08 19:52:07 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/param.h> #include <sys/systm.h> #include <sys/timeout.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/pool.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/syslog.h> #include <sys/queue.h> #include <sys/stdint.h> #include <sys/task.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/if_ether.h> #include <netinet/ip_ipsp.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> #include <netinet/icmp6.h> #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ /* timer values */ int nd6_timer_next = -1; /* at which uptime nd6_timer runs */ time_t nd6_expire_next = -1; /* at which uptime nd6_expire runs */ int nd6_delay = 5; /* delay first probe time 5 second */ int nd6_umaxtries = 3; /* maximum unicast query */ int nd6_mmaxtries = 3; /* maximum multicast query */ int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ /* preventing too many loops in ND option parsing */ int nd6_maxndopt = 10; /* max # of ND options allowed */ int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ #ifdef ND6_DEBUG int nd6_debug = 1; #else int nd6_debug = 0; #endif TAILQ_HEAD(llinfo_nd6_head, llinfo_nd6) nd6_list; struct pool nd6_pool; /* pool for llinfo_nd6 structures */ int nd6_inuse; void nd6_timer(void *); void nd6_slowtimo(void *); void nd6_expire(void *); void nd6_expire_timer(void *); void nd6_invalidate(struct rtentry *); void nd6_free(struct rtentry *); int nd6_llinfo_timer(struct rtentry *); struct timeout nd6_timer_to; struct timeout nd6_slowtimo_ch; struct timeout nd6_expire_timeout; struct task nd6_expire_task; void nd6_init(void) { static int nd6_init_done = 0; if (nd6_init_done) { log(LOG_NOTICE, "%s called more than once\n", __func__); return; } TAILQ_INIT(&nd6_list); pool_init(&nd6_pool, sizeof(struct llinfo_nd6), 0, IPL_SOFTNET, 0, "nd6", NULL); task_set(&nd6_expire_task, nd6_expire, NULL); nd6_init_done = 1; /* start timer */ timeout_set_proc(&nd6_timer_to, nd6_timer, NULL); timeout_set_proc(&nd6_slowtimo_ch, nd6_slowtimo, NULL); timeout_add_sec(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL); timeout_set(&nd6_expire_timeout, nd6_expire_timer, NULL); } struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; return nd; } void nd6_ifdetach(struct nd_ifinfo *nd) { free(nd, 
M_IP6NDP, sizeof(*nd)); } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; if (!ndopts) panic("%s: ndopts == NULL", __func__); if (!ndopts->nd_opts_last) panic("%s: uninitialized ndopts", __func__); if (!ndopts->nd_opts_search) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; if (!ndopts) panic("%s: ndopts == NULL", __func__); if (!ndopts->nd_opts_last) panic("%s: uninitialized ndopts", __func__); if (!ndopts->nd_opts_search) return 0; while (1) { nd_opt = nd6_option(ndopts); if (!nd_opt && !ndopts->nd_opts_last) { /* * Message validation requires that all included * options have a length that is greater than zero. */ icmp6stat_inc(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (!nd_opt) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; case ND_OPT_DNSSL: case ND_OPT_RDNSS: /* Don't warn */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. 
*/ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > nd6_maxndopt) { icmp6stat_inc(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ void nd6_llinfo_settimer(const struct llinfo_nd6 *ln, unsigned int secs) { time_t expire = getuptime() + secs; NET_ASSERT_LOCKED(); KASSERT(!ISSET(ln->ln_rt->rt_flags, RTF_LOCAL)); ln->ln_rt->rt_expire = expire; if (!timeout_pending(&nd6_timer_to) || expire < nd6_timer_next) { nd6_timer_next = expire; timeout_add_sec(&nd6_timer_to, secs); } } void nd6_timer(void *unused) { struct llinfo_nd6 *ln, *nln; time_t expire = getuptime() + nd6_gctimer; int secs; NET_LOCK(); TAILQ_FOREACH_SAFE(ln, &nd6_list, ln_list, nln) { struct rtentry *rt = ln->ln_rt; if (rt->rt_expire && rt->rt_expire <= getuptime()) if (nd6_llinfo_timer(rt)) continue; if (rt->rt_expire && rt->rt_expire < expire) expire = rt->rt_expire; } secs = expire - getuptime(); if (secs < 0) secs = 0; if (!TAILQ_EMPTY(&nd6_list)) { nd6_timer_next = getuptime() + secs; timeout_add_sec(&nd6_timer_to, secs); } NET_UNLOCK(); } /* * ND timer state handling. * * Returns 1 if `rt' should no longer be used, 0 otherwise. */ int nd6_llinfo_timer(struct rtentry *rt) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; struct sockaddr_in6 *dst = satosin6(rt_key(rt)); struct ifnet *ifp; struct nd_ifinfo *ndi = NULL; NET_ASSERT_LOCKED(); if ((ifp = if_get(rt->rt_ifidx)) == NULL) return 1; ndi = ND_IFINFO(ifp); switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->ln_asked < nd6_mmaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, ndi->retrans / 1000); nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); } else { struct mbuf *m = ln->ln_hold; if (m) { ln->ln_hold = NULL; /* * Fake rcvif to make the ICMP error * more helpful in diagnosing for the * receiver. * XXX: should we consider * older rcvif? */ m->m_pkthdr.ph_ifidx = rt->rt_ifidx; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); if (ln->ln_hold == m) { /* m is back in ln_hold. Discard. 
*/ m_freem(ln->ln_hold); ln->ln_hold = NULL; } } nd6_free(rt); ln = NULL; } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, nd6_gctimer); } break; case ND6_LLINFO_STALE: case ND6_LLINFO_PURGE: /* Garbage Collection(RFC 2461 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) { nd6_free(rt); ln = NULL; } break; case ND6_LLINFO_DELAY: if (ndi) { /* We need NUD */ ln->ln_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; nd6_llinfo_settimer(ln, ndi->retrans / 1000); nd6_ns_output(ifp, &dst->sin6_addr, &dst->sin6_addr, ln, 0); } break; case ND6_LLINFO_PROBE: if (ln->ln_asked < nd6_umaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, ndi->retrans / 1000); nd6_ns_output(ifp, &dst->sin6_addr, &dst->sin6_addr, ln, 0); } else { nd6_free(rt); ln = NULL; } break; } if_put(ifp); return (ln == NULL); } void nd6_expire_timer_update(struct in6_ifaddr *ia6) { time_t expire_time = INT64_MAX; int secs; KERNEL_ASSERT_LOCKED(); if (ia6->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) expire_time = ia6->ia6_lifetime.ia6t_expire; if (!(ia6->ia6_flags & IN6_IFF_DEPRECATED) && ia6->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME && expire_time > ia6->ia6_lifetime.ia6t_preferred) expire_time = ia6->ia6_lifetime.ia6t_preferred; if (expire_time == INT64_MAX) return; /* * IFA6_IS_INVALID() and IFA6_IS_DEPRECATED() check for uptime * greater than ia6t_expire or ia6t_preferred, not greater or equal. * Schedule timeout one second later so that either IFA6_IS_INVALID() * or IFA6_IS_DEPRECATED() is true. */ expire_time++; if (!timeout_pending(&nd6_expire_timeout) || nd6_expire_next > expire_time) { secs = expire_time - getuptime(); if (secs < 0) secs = 0; timeout_add_sec(&nd6_expire_timeout, secs); nd6_expire_next = expire_time; } } /* * Expire interface addresses. */ void nd6_expire(void *unused) { struct ifnet *ifp; KERNEL_LOCK(); NET_LOCK(); TAILQ_FOREACH(ifp, &ifnet, if_list) { struct ifaddr *ifa, *nifa; struct in6_ifaddr *ia6; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrlist, ifa_list, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = ifatoia6(ifa); /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { in6_purgeaddr(&ia6->ia_ifa); } else { if (IFA6_IS_DEPRECATED(ia6)) ia6->ia6_flags |= IN6_IFF_DEPRECATED; nd6_expire_timer_update(ia6); } } } NET_UNLOCK(); KERNEL_UNLOCK(); } void nd6_expire_timer(void *unused) { task_add(net_tq(0), &nd6_expire_task); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp) { struct llinfo_nd6 *ln, *nln; NET_ASSERT_LOCKED(); /* * Nuke neighbor cache entries for the ifp. */ TAILQ_FOREACH_SAFE(ln, &nd6_list, ln_list, nln) { struct rtentry *rt; struct sockaddr_dl *sdl; rt = ln->ln_rt; if (rt != NULL && rt->rt_gateway != NULL && rt->rt_gateway->sa_family == AF_LINK) { sdl = satosdl(rt->rt_gateway); if (sdl->sdl_index == ifp->if_index) nd6_free(rt); } } } struct rtentry * nd6_lookup(const struct in6_addr *addr6, int create, struct ifnet *ifp, u_int rtableid) { struct rtentry *rt; struct sockaddr_in6 sin6; int flags; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; flags = (create) ? RT_RESOLVE : 0; rt = rtalloc(sin6tosa(&sin6), flags, rtableid); if (rt != NULL && (rt->rt_flags & RTF_LLINFO) == 0) { /* * This is the case for the default route. 
* If we want to create a neighbor cache for the address, we * should free the route for the destination and allocate an * interface route. */ if (create) { rtfree(rt); rt = NULL; } } if (rt == NULL) { if (create && ifp) { struct rt_addrinfo info; struct ifaddr *ifa; int error; /* * If no route is available and create is set, * we allocate a host route for the destination * and treat it like an interface route. * This hack is necessary for a neighbor which can't * be covered by our own prefix. */ ifa = ifaof_ifpforaddr(sin6tosa(&sin6), ifp); if (ifa == NULL) return (NULL); /* * Create a new route. RTF_LLINFO is necessary * to create a Neighbor Cache entry for the * destination in nd6_rtrequest which will be * called in rtrequest. */ bzero(&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = RTF_HOST | RTF_LLINFO; info.rti_info[RTAX_DST] = sin6tosa(&sin6); info.rti_info[RTAX_GATEWAY] = sdltosa(ifp->if_sadl); error = rtrequest(RTM_ADD, &info, RTP_CONNECTED, &rt, rtableid); if (error) return (NULL); if (rt->rt_llinfo != NULL) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; ln->ln_state = ND6_LLINFO_NOSTATE; } } else return (NULL); } /* * Validation for the entry. * Note that the check for rt_llinfo is necessary because a cloned * route from a parent route that has the L flag (e.g. the default * route to a p2p interface) may have the flag, too, while the * destination is not actually a neighbor. */ if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || (ifp != NULL && rt->rt_ifidx != ifp->if_index)) { if (create) { char addr[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "%s: failed to lookup %s (if=%s)\n", __func__, inet_ntop(AF_INET6, addr6, addr, sizeof(addr)), ifp ? ifp->if_xname : "unspec")); } rtfree(rt); return (NULL); } return (rt); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct in6_ifaddr *ia6; struct ifaddr *ifa; struct rtentry *rt; /* * A link-local address is always a neighbor. * XXX: we should use the sin6_scope_id field rather than the embedded * interface index. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr) && ntohs(*(u_int16_t *)&addr->sin6_addr.s6_addr[2]) == ifp->if_index) return (1); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = ifatoia6(ifa); /* Prefix check down below. */ if (ia6->ia6_flags & IN6_IFF_AUTOCONF) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&addr->sin6_addr, &ia6->ia_addr.sin6_addr, &ia6->ia_prefixmask.sin6_addr)) return (1); } /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ rt = nd6_lookup(&addr->sin6_addr, 0, ifp, ifp->if_rdomain); if (rt != NULL) { rtfree(rt); return (1); } return (0); } void nd6_invalidate(struct rtentry *rt) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; struct sockaddr_dl *sdl = satosdl(rt->rt_gateway); m_freem(ln->ln_hold); sdl->sdl_alen = 0; ln->ln_hold = NULL; ln->ln_state = ND6_LLINFO_INCOMPLETE; ln->ln_asked = 0; } /* * Free an nd6 llinfo entry. 
*/ void nd6_free(struct rtentry *rt) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; struct in6_addr in6 = satosin6(rt_key(rt))->sin6_addr; struct ifnet *ifp; NET_ASSERT_LOCKED(); ifp = if_get(rt->rt_ifidx); if (!ip6_forwarding) { if (ln->ln_router) { /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&in6, ifp); } } KASSERT(!ISSET(rt->rt_flags, RTF_LOCAL)); nd6_invalidate(rt); /* * Detach the route from the routing tree and the list of neighbor * caches, and disable the route entry not to be used in already * cached routes. */ if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED)) rtdeletemsg(rt, ifp, ifp->if_rdomain); if_put(ifp); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? */ void nd6_nud_hint(struct rtentry *rt) { struct llinfo_nd6 *ln; struct ifnet *ifp; ifp = if_get(rt->rt_ifidx); if (ifp == NULL) return; if ((rt->rt_flags & RTF_GATEWAY) != 0 || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_llinfo == NULL || rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_LINK) { /* This is not a host route. */ goto out; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; if (ln->ln_state < ND6_LLINFO_REACHABLE) goto out; /* * if we get upper-layer reachability confirmation many times, * it is possible we have false information. */ ln->ln_byhint++; if (ln->ln_byhint > nd6_maxnudhint) goto out; ln->ln_state = ND6_LLINFO_REACHABLE; if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_settimer(ln, ND_IFINFO(ifp)->reachable); out: if_put(ifp); } void nd6_rtrequest(struct ifnet *ifp, int req, struct rtentry *rt) { struct sockaddr *gate = rt->rt_gateway; struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; struct ifaddr *ifa; struct in6_ifaddr *ifa6; if (ISSET(rt->rt_flags, RTF_GATEWAY|RTF_MULTICAST|RTF_MPLS)) return; if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) { /* * This is probably an interface direct route for a link * which does not need neighbor caches (e.g. fe80::%lo0/64). * We do not need special treatment below for such a route. * Moreover, the RTF_LLINFO flag which would be set below * would annoy the ndp(8) command. */ return; } if (req == RTM_RESOLVE && nd6_need_cache(ifp) == 0) { /* * For routing daemons like ospf6d we allow neighbor discovery * based on the cloning route only. This allows us to sent * packets directly into a network without having an address * with matching prefix on the interface. If the cloning * route is used for an stf interface, we would mistakenly * make a neighbor cache for the host route, and would see * strange neighbor solicitation for the corresponding * destination. In order to avoid confusion, we check if the * interface is suitable for neighbor discovery, and stop the * process if not. Additionally, we remove the LLINFO flag * so that ndp(8) will not try to get the neighbor information * of the destination. */ rt->rt_flags &= ~RTF_LLINFO; return; } switch (req) { case RTM_ADD: if ((rt->rt_flags & RTF_CLONING) || ((rt->rt_flags & (RTF_LLINFO | RTF_LOCAL)) && ln == NULL)) { if (ln != NULL) nd6_llinfo_settimer(ln, 0); if ((rt->rt_flags & RTF_CLONING) != 0) break; } /* * In IPv4 code, we try to announce new RTF_ANNOUNCE entry here. * We don't do that here since llinfo is not ready yet. 
* * There are also couple of other things to be discussed: * - unsolicited NA code needs improvement beforehand * - RFC2461 says we MAY send multicast unsolicited NA * (7.2.6 paragraph 4), however, it also says that we * SHOULD provide a mechanism to prevent multicast NA storm. * we don't have anything like it right now. * note that the mechanism needs a mutual agreement * between proxies, which means that we need to implement * a new protocol, or a new kludge. * - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA. * we need to check ip6forwarding before sending it. * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ #if 0 /* XXX it does not work */ if (rt->rt_flags & RTF_ANNOUNCE) nd6_na_output(ifp, &satosin6(rt_key(rt))->sin6_addr, &satosin6(rt_key(rt))->sin6_addr, ip6_forwarding ? ND_NA_FLAG_ROUTER : 0, 1, NULL); #endif /* FALLTHROUGH */ case RTM_RESOLVE: if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(struct sockaddr_dl)) { log(LOG_DEBUG, "%s: bad gateway value: %s\n", __func__, ifp->if_xname); break; } satosdl(gate)->sdl_type = ifp->if_type; satosdl(gate)->sdl_index = ifp->if_index; if (ln != NULL) break; /* This happens on a route change */ /* * Case 2: This route may come from cloning, or a manual route * add with a LL address. */ ln = pool_get(&nd6_pool, PR_NOWAIT | PR_ZERO); rt->rt_llinfo = (caddr_t)ln; if (ln == NULL) { log(LOG_DEBUG, "%s: pool get failed\n", __func__); break; } nd6_inuse++; ln->ln_rt = rt; /* this is required for "ndp" command. - shin */ if (req == RTM_ADD) { /* * gate should have some valid AF_LINK entry, * and ln expire should have some lifetime * which is specified by ndp command. */ ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; } else { /* * When req == RTM_RESOLVE, rt is created and * initialized in rtrequest(), so rt_expire is 0. */ ln->ln_state = ND6_LLINFO_NOSTATE; nd6_llinfo_settimer(ln, 0); } rt->rt_flags |= RTF_LLINFO; TAILQ_INSERT_HEAD(&nd6_list, ln, ln_list); /* * If we have too many cache entries, initiate immediate * purging for some "less recently used" entries. Note that * we cannot directly call nd6_free() here because it would * cause re-entering rtable related routines triggering an LOR * problem for FreeBSD. */ if (ip6_neighborgcthresh >= 0 && nd6_inuse >= ip6_neighborgcthresh) { int i; for (i = 0; i < 10; i++) { struct llinfo_nd6 *ln_end; ln_end = TAILQ_LAST(&nd6_list, llinfo_nd6_head); if (ln_end == ln) break; /* Move this entry to the head */ TAILQ_REMOVE(&nd6_list, ln_end, ln_list); TAILQ_INSERT_HEAD(&nd6_list, ln_end, ln_list); if (ND6_LLINFO_PERMANENT(ln_end)) continue; if (ln_end->ln_state > ND6_LLINFO_INCOMPLETE) ln_end->ln_state = ND6_LLINFO_STALE; else ln_end->ln_state = ND6_LLINFO_PURGE; nd6_llinfo_settimer(ln_end, 0); } } /* * check if rt_key(rt) is one of my address assigned * to the interface. */ ifa6 = in6ifa_ifpwithaddr(ifp, &satosin6(rt_key(rt))->sin6_addr); ifa = ifa6 ? 
&ifa6->ia_ifa : NULL; if (ifa) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; rt->rt_expire = 0; KASSERT(ifa == rt->rt_ifa); } else if (rt->rt_flags & RTF_ANNOUNCE) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; rt->rt_expire = 0; /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; int error; llsol = satosin6(rt_key(rt))->sin6_addr; llsol.s6_addr16[0] = htons(0xff02); llsol.s6_addr16[1] = htons(ifp->if_index); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_addmulti(&llsol, ifp, &error)) { char addr[INET6_ADDRSTRLEN]; nd6log((LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", ifp->if_xname, inet_ntop(AF_INET6, &llsol, addr, sizeof(addr)), error)); } } } break; case RTM_DELETE: if (ln == NULL) break; /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && (ifp->if_flags & IFF_MULTICAST) != 0) { struct in6_addr llsol; struct in6_multi *in6m; llsol = satosin6(rt_key(rt))->sin6_addr; llsol.s6_addr16[0] = htons(0xff02); llsol.s6_addr16[1] = htons(ifp->if_index); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; IN6_LOOKUP_MULTI(llsol, ifp, in6m); if (in6m) in6_delmulti(in6m); } nd6_inuse--; TAILQ_REMOVE(&nd6_list, ln, ln_list); rt->rt_expire = 0; rt->rt_llinfo = NULL; rt->rt_flags &= ~RTF_LLINFO; m_freem(ln->ln_hold); pool_put(&nd6_pool, ln); break; case RTM_INVALIDATE: if (ln == NULL) break; if (!ISSET(rt->rt_flags, RTF_LOCAL)) nd6_invalidate(rt); break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct rtentry *rt; switch (cmd) { case SIOCGIFINFO_IN6: NET_LOCK_SHARED(); ndi->ndi = *ND_IFINFO(ifp); NET_UNLOCK_SHARED(); return (0); case SIOCGNBRINFO_IN6: { struct llinfo_nd6 *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ time_t expire; NET_LOCK_SHARED(); /* * XXX: KAME specific hack for scoped addresses * XXXX: for other scopes than link-local? */ if (IN6_IS_ADDR_LINKLOCAL(&nbi->addr) || IN6_IS_ADDR_MC_LINKLOCAL(&nbi->addr)) { u_int16_t *idp = (u_int16_t *)&nb_addr.s6_addr[2]; if (*idp == 0) *idp = htons(ifp->if_index); } rt = nd6_lookup(&nb_addr, 0, ifp, ifp->if_rdomain); if (rt == NULL || (ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) { rtfree(rt); NET_UNLOCK_SHARED(); return (EINVAL); } expire = ln->ln_rt->rt_expire; if (expire != 0) { expire -= getuptime(); expire += gettime(); } nbi->state = ln->ln_state; nbi->asked = ln->ln_asked; nbi->isrouter = ln->ln_router; nbi->expire = expire; rtfree(rt); NET_UNLOCK_SHARED(); return (0); } } return (0); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information */ void nd6_cache_lladdr(struct ifnet *ifp, const struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; int is_newentry; struct sockaddr_dl *sdl = NULL; int do_update; int olladdr; int llchange; int newstate = 0; if (!ifp) panic("%s: ifp == NULL", __func__); if (!from) panic("%s: from == NULL", __func__); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer address, what should * we do? 
(ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ rt = nd6_lookup(from, 0, ifp, ifp->if_rdomain); if (rt == NULL) { rt = nd6_lookup(from, 1, ifp, ifp->if_rdomain); is_newentry = 1; } else { /* do not overwrite local or static entry */ if (ISSET(rt->rt_flags, RTF_STATIC|RTF_LOCAL)) { rtfree(rt); return; } is_newentry = 0; } if (!rt) return; if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: nd6_free(rt); rtfree(rt); return; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; if (ln == NULL) goto fail; if (rt->rt_gateway == NULL) goto fail; if (rt->rt_gateway->sa_family != AF_LINK) goto fail; sdl = satosdl(rt->rt_gateway); olladdr = (sdl->sdl_alen) ? 1 : 0; if (olladdr && lladdr) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y -- (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ if (llchange) { char addr[INET6_ADDRSTRLEN]; log(LOG_INFO, "ndp info overwritten for %s by %s on %s\n", inet_ntop(AF_INET6, from, addr, sizeof(addr)), ether_sprintf(lladdr), ifp->if_xname); } if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } if (!is_newentry) { if ((!olladdr && lladdr) || /* (3) */ (olladdr && lladdr && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; if (!lladdr) /* (6) */ newstate = ND6_LLINFO_NOSTATE; else /* (7) */ newstate = ND6_LLINFO_STALE; } if (do_update) { /* * Update the state of the neighbor cache. */ ln->ln_state = newstate; if (ln->ln_state == ND6_LLINFO_STALE) { /* * Since nd6_resolve() in ifp->if_output() will cause * state transition to DELAY and reset the timer, * we must set the timer now, although it is actually * meaningless. */ nd6_llinfo_settimer(ln, nd6_gctimer); if (ln->ln_hold) { struct mbuf *n = ln->ln_hold; ln->ln_hold = NULL; /* * we assume ifp is not a p2p here, so just * set the 2nd argument as the 1st one. */ ifp->if_output(ifp, n, rt_key(rt), rt); if (ln->ln_hold == n) { /* n is back in ln_hold. Discard. */ m_freem(ln->ln_hold); ln->ln_hold = NULL; } } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* probe right away */ nd6_llinfo_settimer(ln, 0); } } /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A question arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * newentry olladdr lladdr llchange NS RS RA redir * D R * 0 n n -- (1) c ? s * 0 y n -- (2) c s s * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. 
*/ if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln->ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; } rtfree(rt); } void nd6_slowtimo(void *ignored_arg) { struct nd_ifinfo *nd6if; struct ifnet *ifp; NET_LOCK(); timeout_add_sec(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL); TAILQ_FOREACH(ifp, &ifnet, if_list) { nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = ND6_RECALC_REACHTM_INTERVAL; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } NET_UNLOCK(); } int nd6_resolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { struct sockaddr_dl *sdl; struct rtentry *rt; struct llinfo_nd6 *ln = NULL; if (m->m_flags & M_MCAST) { ETHER_MAP_IPV6_MULTICAST(&satosin6(dst)->sin6_addr, desten); return (0); } rt = rt_getll(rt0); if (ISSET(rt->rt_flags, RTF_REJECT) && (rt->rt_expire == 0 || getuptime() < rt->rt_expire)) { m_freem(m); return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); } /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (!ISSET(rt->rt_flags, RTF_LLINFO)) { char addr[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "%s: %s: route contains no ND information\n", __func__, inet_ntop(AF_INET6, &satosin6(rt_key(rt))->sin6_addr, addr, sizeof(addr))); m_freem(m); return (EINVAL); } if (rt->rt_gateway->sa_family != AF_LINK) { printf("%s: something odd happens\n", __func__); m_freem(m); return (EINVAL); } ln = (struct llinfo_nd6 *)rt->rt_llinfo; KASSERT(ln != NULL); /* * Move this entry to the head of the queue so that it is less likely * for this entry to be a target of forced garbage collection (see * nd6_rtrequest()). */ TAILQ_REMOVE(&nd6_list, ln, ln_list); TAILQ_INSERT_HEAD(&nd6_list, ln, ln_list); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (ln->ln_state == ND6_LLINFO_STALE) { ln->ln_asked = 0; ln->ln_state = ND6_LLINFO_DELAY; nd6_llinfo_settimer(ln, nd6_delay); } /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. 
*/ if (ln->ln_state > ND6_LLINFO_INCOMPLETE) { sdl = satosdl(rt->rt_gateway); if (sdl->sdl_alen != ETHER_ADDR_LEN) { char addr[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "%s: %s: incorrect nd6 information\n", __func__, inet_ntop(AF_INET6, &satosin6(dst)->sin6_addr, addr, sizeof(addr))); m_freem(m); return (EINVAL); } bcopy(LLADDR(sdl), desten, sdl->sdl_alen); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Replace the held mbuf (if any) with this * latest one. */ if (ln->ln_state == ND6_LLINFO_NOSTATE) ln->ln_state = ND6_LLINFO_INCOMPLETE; m_freem(ln->ln_hold); ln->ln_hold = m; /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. */ if (!ND6_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) { ln->ln_asked++; nd6_llinfo_settimer(ln, ND_IFINFO(ifp)->retrans / 1000); nd6_ns_output(ifp, NULL, &satosin6(dst)->sin6_addr, ln, 0); } return (EAGAIN); } int nd6_need_cache(struct ifnet *ifp) { /* * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE80211: case IFT_CARP: return (1); default: return (0); } }
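/*
 * Illustrative sketch (not part of nd6.c above): the effect of the
 * ND_COMPUTE_RTIME() recomputation done in nd6_ifattach() and
 * nd6_slowtimo().  RFC 4861 (6.3.4) asks that ReachableTime be redrawn
 * periodically as a uniform random factor between MIN_RANDOM_FACTOR
 * (0.5) and MAX_RANDOM_FACTOR (1.5) times BaseReachableTime, so that
 * hosts on a link do not synchronize their NUD probing.  The fixed-point
 * factor below is an assumption for illustration; the real macro may
 * scale differently.
 */
#include <stdint.h>
#include <stdlib.h>	/* arc4random_uniform() */

static uint32_t
compute_reachable_time(uint32_t basereachable)
{
	/* uniform factor in [0.5, 1.5), expressed in 1/1024 units */
	uint32_t factor = 512 + arc4random_uniform(1024);

	return ((uint64_t)basereachable * factor / 1024);
}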
/* $OpenBSD: mpls_input.c,v 1.78 2021/07/22 11:07:17 mvs Exp $ */ /* * Copyright (c) 2008 Claudio Jeker <claudio@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #include <sys/param.h> #include <sys/mbuf.h> #include <sys/systm.h> #include <sys/socket.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_types.h> #include <net/netisr.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #ifdef INET6 #include <netinet/ip6.h> #endif /* INET6 */ #include <netmpls/mpls.h> #ifdef MPLS_DEBUG #define MPLS_LABEL_GET(l) ((ntohl((l) & MPLS_LABEL_MASK)) >> MPLS_LABEL_OFFSET) #define MPLS_TTL_GET(l) (ntohl((l) & MPLS_TTL_MASK)) #endif struct mbuf *mpls_do_error(struct mbuf *, int, int, int); void mpls_input_local(struct rtentry *, struct mbuf *); void mpls_input(struct ifnet *ifp, struct mbuf *m) { struct sockaddr_mpls *smpls; struct sockaddr_mpls sa_mpls; struct shim_hdr *shim; struct rtentry *rt; struct rt_mpls *rt_mpls; uint8_t ttl; int hasbos; if (!ISSET(ifp->if_xflags, IFXF_MPLS)) { m_freem(m); return; } /* drop all broadcast and multicast packets */ if (m->m_flags & (M_BCAST | M_MCAST)) { m_freem(m); return; } if (m->m_len < sizeof(*shim)) { m = m_pullup(m, sizeof(*shim)); if (m == NULL) return; } shim = mtod(m, struct shim_hdr *); #ifdef MPLS_DEBUG printf("mpls_input: iface %s label=%d, ttl=%d BoS %d\n", ifp->if_xname, MPLS_LABEL_GET(shim->shim_label), MPLS_TTL_GET(shim->shim_label), MPLS_BOS_ISSET(shim->shim_label)); #endif /* check and decrement TTL */ ttl = ntohl(shim->shim_label & MPLS_TTL_MASK); if (ttl <= 1) { /* TTL exceeded */ m = mpls_do_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0); if (m == NULL) return; shim = mtod(m, struct shim_hdr *); ttl = ntohl(shim->shim_label & MPLS_TTL_MASK); } else ttl--; hasbos = MPLS_BOS_ISSET(shim->shim_label); bzero(&sa_mpls, sizeof(sa_mpls)); smpls = &sa_mpls; smpls->smpls_family = AF_MPLS; smpls->smpls_len = sizeof(*smpls); smpls->smpls_label = shim->shim_label & MPLS_LABEL_MASK; if (ntohl(smpls->smpls_label) < MPLS_LABEL_RESERVED_MAX) { m = mpls_shim_pop(m); if (m == NULL) return; if (!hasbos) { /* * RFC 4182 relaxes the position of the * explicit NULL labels. They no longer need * to be at the beginning of the stack. * In this case the label is ignored and the decision * is made based on the lower one. 
*/ shim = mtod(m, struct shim_hdr *); smpls->smpls_label = shim->shim_label & MPLS_LABEL_MASK; hasbos = MPLS_BOS_ISSET(shim->shim_label); } else { switch (ntohl(smpls->smpls_label)) { case MPLS_LABEL_IPV4NULL: do_v4: if (mpls_mapttl_ip) { m = mpls_ip_adjttl(m, ttl); if (m == NULL) return; } ipv4_input(ifp, m); return; #ifdef INET6 case MPLS_LABEL_IPV6NULL: do_v6: if (mpls_mapttl_ip6) { m = mpls_ip6_adjttl(m, ttl); if (m == NULL) return; } ipv6_input(ifp, m); return; #endif /* INET6 */ case MPLS_LABEL_IMPLNULL: if (m->m_len < sizeof(u_char) && (m = m_pullup(m, sizeof(u_char))) == NULL) return; switch (*mtod(m, u_char *) >> 4) { case IPVERSION: goto do_v4; #ifdef INET6 case IPV6_VERSION >> 4: goto do_v6; #endif default: m_freem(m); return; } default: /* Other cases are not handled for now */ m_freem(m); return; } } } ifp = NULL; rt = rtalloc(smplstosa(smpls), RT_RESOLVE, m->m_pkthdr.ph_rtableid); if (!rtisvalid(rt)) { /* no entry for this label */ #ifdef MPLS_DEBUG printf("MPLS_DEBUG: label not found\n"); #endif m_freem(m); goto done; } rt_mpls = (struct rt_mpls *)rt->rt_llinfo; if (rt_mpls == NULL || (rt->rt_flags & RTF_MPLS) == 0) { #ifdef MPLS_DEBUG printf("MPLS_DEBUG: no MPLS information attached\n"); #endif m_freem(m); goto done; } switch (rt_mpls->mpls_operation) { case MPLS_OP_POP: if (ISSET(rt->rt_flags, RTF_LOCAL)) { mpls_input_local(rt, m); goto done; } m = mpls_shim_pop(m); if (m == NULL) goto done; if (!hasbos) /* just forward to gw */ break; /* last label popped so decide where to push it to */ ifp = if_get(rt->rt_ifidx); if (ifp == NULL) { m_freem(m); goto done; } KASSERT(rt->rt_gateway); switch(rt->rt_gateway->sa_family) { case AF_INET: if ((m = mpls_ip_adjttl(m, ttl)) == NULL) goto done; break; #ifdef INET6 case AF_INET6: if ((m = mpls_ip6_adjttl(m, ttl)) == NULL) goto done; break; #endif case AF_LINK: break; default: m_freem(m); goto done; } /* shortcut sending out the packet */ if (!ISSET(ifp->if_xflags, IFXF_MPLS)) (*ifp->if_output)(ifp, m, rt->rt_gateway, rt); else (*ifp->if_ll_output)(ifp, m, rt->rt_gateway, rt); goto done; case MPLS_OP_PUSH: /* this does not make much sense but it does not hurt */ m = mpls_shim_push(m, rt_mpls); break; case MPLS_OP_SWAP: m = mpls_shim_swap(m, rt_mpls); break; default: m_freem(m); goto done; } if (m == NULL) goto done; /* refetch label and write back TTL */ shim = mtod(m, struct shim_hdr *); shim->shim_label = (shim->shim_label & ~MPLS_TTL_MASK) | htonl(ttl); ifp = if_get(rt->rt_ifidx); if (ifp == NULL) { m_freem(m); goto done; } #ifdef MPLS_DEBUG printf("MPLS: sending on %s outlabel %x dst af %d in %d out %d\n", ifp->if_xname, ntohl(shim->shim_label), smpls->smpls_family, MPLS_LABEL_GET(smpls->smpls_label), MPLS_LABEL_GET(rt_mpls->mpls_label)); #endif /* Output iface is not MPLS-enabled */ if (!ISSET(ifp->if_xflags, IFXF_MPLS)) { #ifdef MPLS_DEBUG printf("MPLS_DEBUG: interface %s not mpls enabled\n", ifp->if_xname); #endif m_freem(m); goto done; } (*ifp->if_ll_output)(ifp, m, smplstosa(smpls), rt); done: if_put(ifp); rtfree(rt); } void mpls_input_local(struct rtentry *rt, struct mbuf *m) { struct ifnet *ifp; ifp = if_get(rt->rt_ifidx); if (ifp == NULL) { m_freem(m); return; } /* shortcut sending out the packet */ if (!ISSET(ifp->if_xflags, IFXF_MPLS)) (*ifp->if_output)(ifp, m, rt->rt_gateway, rt); else (*ifp->if_ll_output)(ifp, m, rt->rt_gateway, rt); if_put(ifp); } struct mbuf * mpls_ip_adjttl(struct mbuf *m, u_int8_t ttl) { struct ip *ip; uint16_t old, new; uint32_t x; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (m 
== NULL) return (NULL); } ip = mtod(m, struct ip *); old = htons(ip->ip_ttl << 8); new = htons(ttl << 8); x = ip->ip_sum + old - new; ip->ip_ttl = ttl; /* see pf_cksum_fixup() */ ip->ip_sum = (x) + (x >> 16); return (m); } #ifdef INET6 struct mbuf * mpls_ip6_adjttl(struct mbuf *m, u_int8_t ttl) { struct ip6_hdr *ip6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (m == NULL) return (NULL); } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_hlim = ttl; return (m); } #endif /* INET6 */ struct mbuf * mpls_do_error(struct mbuf *m, int type, int code, int destmtu) { struct shim_hdr stack[MPLS_INKERNEL_LOOP_MAX]; struct sockaddr_mpls sa_mpls; struct sockaddr_mpls *smpls; struct rtentry *rt = NULL; struct shim_hdr *shim; struct in_ifaddr *ia; struct icmp *icp; struct ip *ip; int nstk, error; for (nstk = 0; nstk < MPLS_INKERNEL_LOOP_MAX; nstk++) { if (m->m_len < sizeof(*shim) && (m = m_pullup(m, sizeof(*shim))) == NULL) return (NULL); stack[nstk] = *mtod(m, struct shim_hdr *); m_adj(m, sizeof(*shim)); if (MPLS_BOS_ISSET(stack[nstk].shim_label)) break; } shim = &stack[0]; if (m->m_len < sizeof(u_char) && (m = m_pullup(m, sizeof(u_char))) == NULL) return (NULL); switch (*mtod(m, u_char *) >> 4) { case IPVERSION: if (m->m_len < sizeof(*ip) && (m = m_pullup(m, sizeof(*ip))) == NULL) return (NULL); m = icmp_do_error(m, type, code, 0, destmtu); if (m == NULL) return (NULL); if (icmp_do_exthdr(m, ICMP_EXT_MPLS, 1, stack, (nstk + 1) * sizeof(*shim))) return (NULL); /* set ip_src to something usable, based on the MPLS label */ bzero(&sa_mpls, sizeof(sa_mpls)); smpls = &sa_mpls; smpls->smpls_family = AF_MPLS; smpls->smpls_len = sizeof(*smpls); smpls->smpls_label = shim->shim_label & MPLS_LABEL_MASK; rt = rtalloc(smplstosa(smpls), RT_RESOLVE, 0); if (!rtisvalid(rt)) { rtfree(rt); /* no entry for this label */ m_freem(m); return (NULL); } if (rt->rt_ifa->ifa_addr->sa_family == AF_INET) ia = ifatoia(rt->rt_ifa); else { /* XXX this needs fixing, if the MPLS is on an IP * less interface we need to find some other IP to * use as source. */ rtfree(rt); m_freem(m); return (NULL); } /* It is safe to dereference ``ia'' iff ``rt'' is valid. */ error = icmp_reflect(m, NULL, ia); rtfree(rt); if (error) return (NULL); ip = mtod(m, struct ip *); /* stuff to fix up which is normally done in ip_output */ ip->ip_v = IPVERSION; ip->ip_id = htons(ip_randomid()); ip->ip_sum = 0; ip->ip_sum = in_cksum(m, sizeof(*ip)); /* stolen from icmp_send() */ icp = (struct icmp *)(mtod(m, caddr_t) + sizeof(*ip)); icp->icmp_cksum = 0; icp->icmp_cksum = in4_cksum(m, 0, sizeof(*ip), ntohs(ip->ip_len) - sizeof(*ip)); break; #ifdef INET6 case IPV6_VERSION >> 4: #endif default: m_freem(m); return (NULL); } /* add mpls stack back to new packet */ M_PREPEND(m, (nstk + 1) * sizeof(*shim), M_NOWAIT); if (m == NULL) return (NULL); m_copyback(m, 0, (nstk + 1) * sizeof(*shim), stack, M_NOWAIT); /* change TTL to default */ shim = mtod(m, struct shim_hdr *); shim->shim_label = (shim->shim_label & ~MPLS_TTL_MASK) | htonl(mpls_defttl); return (m); }
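/*
 * Illustrative sketch (not part of mpls_input.c above): the incremental
 * Internet-checksum update that mpls_ip_adjttl() applies after rewriting
 * the IPv4 TTL, in the style of pf_cksum_fixup() (cf. RFC 1624).
 * Replacing one 16-bit word of a checksummed header only requires
 * folding the difference between the old and the new value back into
 * the existing one's-complement sum rather than recomputing the whole
 * header checksum.
 */
#include <stdint.h>

static uint16_t
cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new)
{
	uint32_t x = cksum + old - new;

	/* fold the carry from the 32-bit intermediate back into 16 bits */
	x = (x >> 16) + (x & 0xffff);
	return (x & 0xffff);
}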
/*
$OpenBSD: kern_event.c,v 1.193 2022/08/14 01:58:27 jsg Exp $ */ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/sys/kern/kern_event.c,v 1.22 2001/02/23 20:32:42 jlemon Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/pledge.h> #include <sys/malloc.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/fcntl.h> #include <sys/queue.h> #include <sys/event.h> #include <sys/eventvar.h> #include <sys/ktrace.h> #include <sys/pool.h> #include <sys/stat.h> #include <sys/mount.h> #include <sys/syscallargs.h> #include <sys/time.h> #include <sys/timeout.h> #include <sys/vnode.h> #include <sys/wait.h> #ifdef DIAGNOSTIC #define KLIST_ASSERT_LOCKED(kl) do { \ if ((kl)->kl_ops != NULL) \ (kl)->kl_ops->klo_assertlk((kl)->kl_arg); \ else \ KERNEL_ASSERT_LOCKED(); \ } while (0) #else #define KLIST_ASSERT_LOCKED(kl) ((void)(kl)) #endif struct kqueue *kqueue_alloc(struct filedesc *); void kqueue_terminate(struct proc *p, struct kqueue *); void KQREF(struct kqueue *); void KQRELE(struct kqueue *); void kqueue_purge(struct proc *, struct kqueue *); int kqueue_sleep(struct kqueue *, struct timespec *); int kqueue_read(struct file *, struct uio *, int); int kqueue_write(struct file *, struct uio *, int); int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p); int kqueue_kqfilter(struct file *fp, struct knote *kn); int kqueue_stat(struct file *fp, struct stat *st, struct proc *p); int kqueue_close(struct file *fp, struct proc *p); void kqueue_wakeup(struct kqueue *kq); #ifdef KQUEUE_DEBUG void kqueue_do_check(struct kqueue *kq, const char *func, int line); #define kqueue_check(kq) kqueue_do_check((kq), __func__, __LINE__) #else #define kqueue_check(kq) do {} while (0) #endif static int filter_attach(struct knote *kn); static void filter_detach(struct knote *kn); static int filter_event(struct knote *kn, long hint); static int filter_modify(struct kevent *kev, struct knote *kn); static int filter_process(struct knote *kn, struct kevent *kev); static void kqueue_expand_hash(struct kqueue *kq); static void kqueue_expand_list(struct kqueue *kq, int fd); static void kqueue_task(void *); static int klist_lock(struct klist *); static void 
klist_unlock(struct klist *, int); const struct fileops kqueueops = { .fo_read = kqueue_read, .fo_write = kqueue_write, .fo_ioctl = kqueue_ioctl, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close }; void knote_attach(struct knote *kn); void knote_detach(struct knote *kn); void knote_drop(struct knote *kn, struct proc *p); void knote_enqueue(struct knote *kn); void knote_dequeue(struct knote *kn); int knote_acquire(struct knote *kn, struct klist *, int); void knote_release(struct knote *kn); void knote_activate(struct knote *kn); void knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, int idx, int purge); void filt_kqdetach(struct knote *kn); int filt_kqueue(struct knote *kn, long hint); int filt_kqueuemodify(struct kevent *kev, struct knote *kn); int filt_kqueueprocess(struct knote *kn, struct kevent *kev); int filt_kqueue_common(struct knote *kn, struct kqueue *kq); int filt_procattach(struct knote *kn); void filt_procdetach(struct knote *kn); int filt_proc(struct knote *kn, long hint); int filt_fileattach(struct knote *kn); void filt_timerexpire(void *knx); int filt_timerattach(struct knote *kn); void filt_timerdetach(struct knote *kn); int filt_timermodify(struct kevent *kev, struct knote *kn); int filt_timerprocess(struct knote *kn, struct kevent *kev); void filt_seltruedetach(struct knote *kn); const struct filterops kqread_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_kqdetach, .f_event = filt_kqueue, .f_modify = filt_kqueuemodify, .f_process = filt_kqueueprocess, }; const struct filterops proc_filtops = { .f_flags = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; const struct filterops file_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = filt_fileattach, .f_detach = NULL, .f_event = NULL, }; const struct filterops timer_filtops = { .f_flags = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = NULL, .f_modify = filt_timermodify, .f_process = filt_timerprocess, }; struct pool knote_pool; struct pool kqueue_pool; struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR); int kq_ntimeouts = 0; int kq_timeoutmax = (4 * 1024); #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) /* * Table for all system-defined filters.
*/ const struct filterops *const sysfilt_ops[] = { &file_filtops, /* EVFILT_READ */ &file_filtops, /* EVFILT_WRITE */ NULL, /*&aio_filtops,*/ /* EVFILT_AIO */ &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ &file_filtops, /* EVFILT_DEVICE */ &file_filtops, /* EVFILT_EXCEPT */ }; void KQREF(struct kqueue *kq) { refcnt_take(&kq->kq_refcnt); } void KQRELE(struct kqueue *kq) { struct filedesc *fdp; if (refcnt_rele(&kq->kq_refcnt) == 0) return; fdp = kq->kq_fdp; if (rw_status(&fdp->fd_lock) == RW_WRITE) { LIST_REMOVE(kq, kq_next); } else { fdplock(fdp); LIST_REMOVE(kq, kq_next); fdpunlock(fdp); } KASSERT(TAILQ_EMPTY(&kq->kq_head)); KASSERT(kq->kq_nknotes == 0); free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * sizeof(struct knlist)); hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT); klist_free(&kq->kq_klist); pool_put(&kqueue_pool, kq); } void kqueue_init(void) { pool_init(&kqueue_pool, sizeof(struct kqueue), 0, IPL_MPFLOOR, PR_WAITOK, "kqueuepl", NULL); pool_init(&knote_pool, sizeof(struct knote), 0, IPL_MPFLOOR, PR_WAITOK, "knotepl", NULL); } void kqueue_init_percpu(void) { pool_cache_init(&knote_pool); } int filt_fileattach(struct knote *kn) { struct file *fp = kn->kn_fp; return fp->f_ops->fo_kqfilter(fp, kn); } int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &kqread_filtops; klist_insert(&kq->kq_klist, kn); return (0); } void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; klist_remove(&kq->kq_klist, kn); } int filt_kqueue_common(struct knote *kn, struct kqueue *kq) { MUTEX_ASSERT_LOCKED(&kq->kq_lock); kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); active = filt_kqueue_common(kn, kq); mtx_leave(&kq->kq_lock); return (active); } int filt_kqueuemodify(struct kevent *kev, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); knote_assign(kev, kn); active = filt_kqueue_common(kn, kq); mtx_leave(&kq->kq_lock); return (active); } int filt_kqueueprocess(struct knote *kn, struct kevent *kev) { struct kqueue *kq = kn->kn_fp->f_data; int active; mtx_enter(&kq->kq_lock); if (kev != NULL && (kn->kn_flags & EV_ONESHOT)) active = 1; else active = filt_kqueue_common(kn, kq); if (active) knote_submit(kn, kev); mtx_leave(&kq->kq_lock); return (active); } int filt_procattach(struct knote *kn) { struct process *pr; int s; if ((curproc->p_p->ps_flags & PS_PLEDGE) && (curproc->p_p->ps_pledge & PLEDGE_PROC) == 0) return pledge_fail(curproc, EPERM, PLEDGE_PROC); if (kn->kn_id > PID_MAX) return ESRCH; pr = prfind(kn->kn_id); if (pr == NULL) return (ESRCH); /* exiting processes can't be specified */ if (pr->ps_flags & PS_EXITING) return (ESRCH); kn->kn_ptr.p_process = pr; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } s = splhigh(); klist_insert_locked(&pr->ps_klist, kn); splx(s); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. 
So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ void filt_procdetach(struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct process *pr = kn->kn_ptr.p_process; int s, status; mtx_enter(&kq->kq_lock); status = kn->kn_status; mtx_leave(&kq->kq_lock); if (status & KN_DETACHED) return; s = splhigh(); klist_remove_locked(&pr->ps_klist, kn); splx(s); } int filt_proc(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_kq; u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished and remove it * from the process's klist */ if (event == NOTE_EXIT) { struct process *pr = kn->kn_ptr.p_process; int s; mtx_enter(&kq->kq_lock); kn->kn_status |= KN_DETACHED; mtx_leave(&kq->kq_lock); s = splhigh(); kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig); klist_remove_locked(&pr->ps_klist, kn); splx(s); return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ memset(&kev, 0, sizeof(kev)); kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_udata; /* preserve udata */ error = kqueue_register(kq, &kev, 0, NULL); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static void filt_timer_timeout_add(struct knote *kn) { struct timeval tv; struct timeout *to = kn->kn_hook; int tticks; tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = tvtohz(&tv); /* Remove extra tick from tvtohz() if timeout has fired before. */ if (timeout_triggered(to)) tticks--; timeout_add(to, (tticks > 0) ? tticks : 1); } void filt_timerexpire(void *knx) { struct knote *kn = knx; struct kqueue *kq = kn->kn_kq; kn->kn_data++; mtx_enter(&kq->kq_lock); knote_activate(kn); mtx_leave(&kq->kq_lock); if ((kn->kn_flags & EV_ONESHOT) == 0) filt_timer_timeout_add(kn); } /* * data contains amount of time to sleep, in milliseconds */ int filt_timerattach(struct knote *kn) { struct timeout *to; if (kq_ntimeouts > kq_timeoutmax) return (ENOMEM); kq_ntimeouts++; kn->kn_flags |= EV_CLEAR; /* automatically set */ to = malloc(sizeof(*to), M_KEVENT, M_WAITOK); timeout_set(to, filt_timerexpire, kn); kn->kn_hook = to; filt_timer_timeout_add(kn); return (0); } void filt_timerdetach(struct knote *kn) { struct timeout *to; to = (struct timeout *)kn->kn_hook; timeout_del_barrier(to); free(to, M_KEVENT, sizeof(*to)); kq_ntimeouts--; } int filt_timermodify(struct kevent *kev, struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct timeout *to = kn->kn_hook; /* Reset the timer. Any pending events are discarded. 
*/ timeout_del_barrier(to); mtx_enter(&kq->kq_lock); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; mtx_leave(&kq->kq_lock); kn->kn_data = 0; knote_assign(kev, kn); /* Reinit timeout to invoke tick adjustment again. */ timeout_set(to, filt_timerexpire, kn); filt_timer_timeout_add(kn); return (0); } int filt_timerprocess(struct knote *kn, struct kevent *kev) { int active, s; s = splsoftclock(); active = (kn->kn_data != 0); if (active) knote_submit(kn, kev); splx(s); return (active); } /* * filt_seltrue: * * This filter "event" routine simulates seltrue(). */ int filt_seltrue(struct knote *kn, long hint) { /* * We don't know how much data can be read/written, * but we know that it *can* be. This is about as * good as select/poll does as well. */ kn->kn_data = 0; return (1); } int filt_seltruemodify(struct kevent *kev, struct knote *kn) { knote_assign(kev, kn); return (kn->kn_fop->f_event(kn, 0)); } int filt_seltrueprocess(struct knote *kn, struct kevent *kev) { int active; active = kn->kn_fop->f_event(kn, 0); if (active) knote_submit(kn, kev); return (active); } /* * This provides full kqfilter entry for device switch tables, which * has same effect as filter using filt_seltrue() as filter method. */ void filt_seltruedetach(struct knote *kn) { /* Nothing to do */ } const struct filterops seltrue_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_seltruedetach, .f_event = filt_seltrue, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; int seltrue_kqfilter(dev_t dev, struct knote *kn) { switch (kn->kn_filter) { case EVFILT_READ: case EVFILT_WRITE: kn->kn_fop = &seltrue_filtops; break; default: return (EINVAL); } /* Nothing more to do */ return (0); } static int filt_dead(struct knote *kn, long hint) { if (kn->kn_filter == EVFILT_EXCEPT) { /* * Do not deliver event because there is no out-of-band data. * However, let HUP condition pass for poll(2). */ if ((kn->kn_flags & __EV_POLL) == 0) { kn->kn_flags |= EV_DISABLE; return (0); } } kn->kn_flags |= (EV_EOF | EV_ONESHOT); if (kn->kn_flags & __EV_POLL) kn->kn_flags |= __EV_HUP; kn->kn_data = 0; return (1); } static void filt_deaddetach(struct knote *kn) { /* Nothing to do */ } const struct filterops dead_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_deaddetach, .f_event = filt_dead, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; static int filt_badfd(struct knote *kn, long hint) { kn->kn_flags |= (EV_ERROR | EV_ONESHOT); kn->kn_data = EBADF; return (1); } /* For use with kqpoll. 
*/ const struct filterops badfd_filtops = { .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, .f_attach = NULL, .f_detach = filt_deaddetach, .f_event = filt_badfd, .f_modify = filt_seltruemodify, .f_process = filt_seltrueprocess, }; static int filter_attach(struct knote *kn) { int error; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { error = kn->kn_fop->f_attach(kn); } else { KERNEL_LOCK(); error = kn->kn_fop->f_attach(kn); KERNEL_UNLOCK(); } return (error); } static void filter_detach(struct knote *kn) { if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { kn->kn_fop->f_detach(kn); } else { KERNEL_LOCK(); kn->kn_fop->f_detach(kn); KERNEL_UNLOCK(); } } static int filter_event(struct knote *kn, long hint) { if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) KERNEL_ASSERT_LOCKED(); return (kn->kn_fop->f_event(kn, hint)); } static int filter_modify(struct kevent *kev, struct knote *kn) { int active, s; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { active = kn->kn_fop->f_modify(kev, kn); } else { KERNEL_LOCK(); if (kn->kn_fop->f_modify != NULL) { active = kn->kn_fop->f_modify(kev, kn); } else { s = splhigh(); active = knote_modify(kev, kn); splx(s); } KERNEL_UNLOCK(); } return (active); } static int filter_process(struct knote *kn, struct kevent *kev) { int active, s; if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) { active = kn->kn_fop->f_process(kn, kev); } else { KERNEL_LOCK(); if (kn->kn_fop->f_process != NULL) { active = kn->kn_fop->f_process(kn, kev); } else { s = splhigh(); active = knote_process(kn, kev); splx(s); } KERNEL_UNLOCK(); } return (active); } /* * Initialize the current thread for poll/select system call. * num indicates the number of serials that the system call may utilize. * After this function, the valid range of serials is * p_kq_serial <= x < p_kq_serial + num. */ void kqpoll_init(unsigned int num) { struct proc *p = curproc; struct filedesc *fdp; if (p->p_kq == NULL) { p->p_kq = kqueue_alloc(p->p_fd); p->p_kq_serial = arc4random(); fdp = p->p_fd; fdplock(fdp); LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next); fdpunlock(fdp); } if (p->p_kq_serial + num < p->p_kq_serial) { /* Serial is about to wrap. Clear all attached knotes. */ kqueue_purge(p, p->p_kq); p->p_kq_serial = 0; } } /* * Finish poll/select system call. * num must have the same value that was used with kqpoll_init(). */ void kqpoll_done(unsigned int num) { struct proc *p = curproc; struct kqueue *kq = p->p_kq; KASSERT(p->p_kq != NULL); KASSERT(p->p_kq_serial + num >= p->p_kq_serial); p->p_kq_serial += num; /* * Because of kn_pollid key, a thread can in principle allocate * up to O(maxfiles^2) knotes by calling poll(2) repeatedly * with suitably varying pollfd arrays. * Prevent such a large allocation by clearing knotes eagerly * if there are too many of them. * * A small multiple of kq_knlistsize should give enough margin * that eager clearing is infrequent, or does not happen at all, * with normal programs. * A single pollfd entry can use up to three knotes. * Typically there is no significant overlap of fd and events * between different entries in the pollfd array. 
*/ if (kq->kq_nknotes > 4 * kq->kq_knlistsize) kqueue_purge(p, kq); } void kqpoll_exit(void) { struct proc *p = curproc; if (p->p_kq == NULL) return; kqueue_purge(p, p->p_kq); kqueue_terminate(p, p->p_kq); KASSERT(p->p_kq->kq_refcnt.r_refs == 1); KQRELE(p->p_kq); p->p_kq = NULL; } struct kqueue * kqueue_alloc(struct filedesc *fdp) { struct kqueue *kq; kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO); refcnt_init(&kq->kq_refcnt); kq->kq_fdp = fdp; TAILQ_INIT(&kq->kq_head); mtx_init(&kq->kq_lock, IPL_HIGH); task_set(&kq->kq_task, kqueue_task, kq); klist_init_mutex(&kq->kq_klist, &kqueue_klist_lock); return (kq); } int sys_kqueue(struct proc *p, void *v, register_t *retval) { struct filedesc *fdp = p->p_fd; struct kqueue *kq; struct file *fp; int fd, error; kq = kqueue_alloc(fdp); fdplock(fdp); error = falloc(p, &fp, &fd); if (error) goto out; fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; fp->f_data = kq; *retval = fd; LIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_next); kq = NULL; fdinsert(fdp, fd, 0, fp); FRELE(fp, p); out: fdpunlock(fdp); if (kq != NULL) pool_put(&kqueue_pool, kq); return (error); } int sys_kevent(struct proc *p, void *v, register_t *retval) { struct kqueue_scan_state scan; struct filedesc* fdp = p->p_fd; struct sys_kevent_args /* { syscallarg(int) fd; syscallarg(const struct kevent *) changelist; syscallarg(int) nchanges; syscallarg(struct kevent *) eventlist; syscallarg(int) nevents; syscallarg(const struct timespec *) timeout; } */ *uap = v; struct kevent *kevp; struct kqueue *kq; struct file *fp; struct timespec ts; struct timespec *tsp = NULL; int i, n, nerrors, error; int ready, total; struct kevent kev[KQ_NEVENTS]; if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL) return (EBADF); if (fp->f_type != DTYPE_KQUEUE) { error = EBADF; goto done; } if (SCARG(uap, timeout) != NULL) { error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); if (error) goto done; #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrreltimespec(p, &ts); #endif if (ts.tv_sec < 0 || !timespecisvalid(&ts)) { error = EINVAL; goto done; } tsp = &ts; } kq = fp->f_data; nerrors = 0; while ((n = SCARG(uap, nchanges)) > 0) { if (n > nitems(kev)) n = nitems(kev); error = copyin(SCARG(uap, changelist), kev, n * sizeof(struct kevent)); if (error) goto done; #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrevent(p, kev, n); #endif for (i = 0; i < n; i++) { kevp = &kev[i]; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, 0, p); if (error || (kevp->flags & EV_RECEIPT)) { if (SCARG(uap, nevents) != 0) { kevp->flags = EV_ERROR; kevp->data = error; copyout(kevp, SCARG(uap, eventlist), sizeof(*kevp)); SCARG(uap, eventlist)++; SCARG(uap, nevents)--; nerrors++; } else { goto done; } } } SCARG(uap, nchanges) -= n; SCARG(uap, changelist) += n; } if (nerrors) { *retval = nerrors; error = 0; goto done; } kqueue_scan_setup(&scan, kq); FRELE(fp, p); /* * Collect as many events as we can. The timeout on successive * loops is disabled (kqueue_scan() becomes non-blocking). 
*/ total = 0; error = 0; while ((n = SCARG(uap, nevents) - total) > 0) { if (n > nitems(kev)) n = nitems(kev); ready = kqueue_scan(&scan, n, kev, tsp, p, &error); if (ready == 0) break; error = copyout(kev, SCARG(uap, eventlist) + total, sizeof(struct kevent) * ready); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrevent(p, kev, ready); #endif total += ready; if (error || ready < n) break; } kqueue_scan_finish(&scan); *retval = total; return (error); done: FRELE(fp, p); return (error); } #ifdef KQUEUE_DEBUG void kqueue_do_check(struct kqueue *kq, const char *func, int line) { struct knote *kn; int count = 0, nmarker = 0; MUTEX_ASSERT_LOCKED(&kq->kq_lock); TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { if (kn->kn_filter == EVFILT_MARKER) { if ((kn->kn_status & KN_QUEUED) != 0) panic("%s:%d: kq=%p kn=%p marker QUEUED", func, line, kq, kn); nmarker++; } else { if ((kn->kn_status & KN_ACTIVE) == 0) panic("%s:%d: kq=%p kn=%p knote !ACTIVE", func, line, kq, kn); if ((kn->kn_status & KN_QUEUED) == 0) panic("%s:%d: kq=%p kn=%p knote !QUEUED", func, line, kq, kn); if (kn->kn_kq != kq) panic("%s:%d: kq=%p kn=%p kn_kq=%p != kq", func, line, kq, kn, kn->kn_kq); count++; if (count > kq->kq_count) goto bad; } } if (count != kq->kq_count) { bad: panic("%s:%d: kq=%p kq_count=%d count=%d nmarker=%d", func, line, kq, kq->kq_count, count, nmarker); } } #endif int kqueue_register(struct kqueue *kq, struct kevent *kev, unsigned int pollid, struct proc *p) { struct filedesc *fdp = kq->kq_fdp; const struct filterops *fops = NULL; struct file *fp = NULL; struct knote *kn = NULL, *newkn = NULL; struct knlist *list = NULL; int active, error = 0; KASSERT(pollid == 0 || (p != NULL && p->p_kq == kq)); if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) return (EINVAL); fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } if (fops == NULL) { /* * XXX * filter attach routine is responsible for ensuring that * the identifier can be attached to it. */ return (EINVAL); } if (fops->f_flags & FILTEROP_ISFD) { /* validate descriptor */ if (kev->ident > INT_MAX) return (EBADF); } if (kev->flags & EV_ADD) newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO); again: if (fops->f_flags & FILTEROP_ISFD) { if ((fp = fd_getfile(fdp, kev->ident)) == NULL) { error = EBADF; goto done; } mtx_enter(&kq->kq_lock); if (kev->flags & EV_ADD) kqueue_expand_list(kq, kev->ident); if (kev->ident < kq->kq_knlistsize) list = &kq->kq_knlist[kev->ident]; } else { mtx_enter(&kq->kq_lock); if (kev->flags & EV_ADD) kqueue_expand_hash(kq); if (kq->kq_knhashmask != 0) { list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; } } if (list != NULL) { SLIST_FOREACH(kn, list, kn_link) { if (kev->filter == kn->kn_filter && kev->ident == kn->kn_id && pollid == kn->kn_pollid) { if (!knote_acquire(kn, NULL, 0)) { /* knote_acquire() has released * kq_lock. */ if (fp != NULL) { FRELE(fp, p); fp = NULL; } goto again; } break; } } } KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0); if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { mtx_leave(&kq->kq_lock); error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match. */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = newkn; newkn = NULL; kn->kn_status = KN_PROCESSING; kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference count to knote structure, and * do not release it at the end of this routine. 
*/ fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_pollid = pollid; knote_attach(kn); mtx_leave(&kq->kq_lock); error = filter_attach(kn); if (error != 0) { knote_drop(kn, p); goto done; } /* * If this is a file descriptor filter, check if * fd was closed while the knote was being added. * knote_fdclose() has missed kn if the function * ran before kn appeared in kq_knlist. */ if ((fops->f_flags & FILTEROP_ISFD) && fd_checkclosed(fdp, kev->ident, kn->kn_fp)) { /* * Drop the knote silently without error * because another thread might already have * seen it. This corresponds to the insert * happening in full before the close. */ filter_detach(kn); knote_drop(kn, p); goto done; } /* Check if there is a pending event. */ active = filter_process(kn, NULL); mtx_enter(&kq->kq_lock); if (active) knote_activate(kn); } else if (kn->kn_fop == &badfd_filtops) { /* * Nothing expects this badfd knote any longer. * Drop it to make room for the new knote and retry. */ KASSERT(kq == p->p_kq); mtx_leave(&kq->kq_lock); filter_detach(kn); knote_drop(kn, p); KASSERT(fp != NULL); FRELE(fp, p); fp = NULL; goto again; } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filters which have already been triggered. */ mtx_leave(&kq->kq_lock); active = filter_modify(kev, kn); mtx_enter(&kq->kq_lock); if (active) knote_activate(kn); if (kev->flags & EV_ERROR) { error = kev->data; goto release; } } } else if (kev->flags & EV_DELETE) { mtx_leave(&kq->kq_lock); filter_detach(kn); knote_drop(kn, p); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) kn->kn_status |= KN_DISABLED; if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { kn->kn_status &= ~KN_DISABLED; mtx_leave(&kq->kq_lock); /* Check if there is a pending event. */ active = filter_process(kn, NULL); mtx_enter(&kq->kq_lock); if (active) knote_activate(kn); } release: knote_release(kn); mtx_leave(&kq->kq_lock); done: if (fp != NULL) FRELE(fp, p); if (newkn != NULL) pool_put(&knote_pool, newkn); return (error); } int kqueue_sleep(struct kqueue *kq, struct timespec *tsp) { struct timespec elapsed, start, stop; uint64_t nsecs; int error; MUTEX_ASSERT_LOCKED(&kq->kq_lock); if (tsp != NULL) { getnanouptime(&start); nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP); } else nsecs = INFSLP; error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK, "kqread", nsecs); if (tsp != NULL) { getnanouptime(&stop); timespecsub(&stop, &start, &elapsed); timespecsub(tsp, &elapsed, tsp); if (tsp->tv_sec < 0) timespecclear(tsp); } return (error); } /* * Scan the kqueue, blocking if necessary until the target time is reached. * If tsp is NULL we block indefinitely. If tsp->ts_secs/nsecs are both * 0 we do not block at all. */ int kqueue_scan(struct kqueue_scan_state *scan, int maxevents, struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp) { struct kqueue *kq = scan->kqs_kq; struct knote *kn; int error = 0, nkev = 0; int reinserted; if (maxevents == 0) goto done; retry: KASSERT(nkev == 0); error = 0; reinserted = 0; /* msleep() with PCATCH requires kernel lock. */ KERNEL_LOCK(); mtx_enter(&kq->kq_lock); if (kq->kq_state & KQ_DYING) { mtx_leave(&kq->kq_lock); KERNEL_UNLOCK(); error = EBADF; goto done; } if (kq->kq_count == 0) { /* * Successive loops are only necessary if there are more * ready events to gather, so they don't need to block. 
*/ if ((tsp != NULL && !timespecisset(tsp)) || scan->kqs_nevent != 0) { mtx_leave(&kq->kq_lock); KERNEL_UNLOCK(); error = 0; goto done; } kq->kq_state |= KQ_SLEEP; error = kqueue_sleep(kq, tsp); /* kqueue_sleep() has released kq_lock. */ KERNEL_UNLOCK(); if (error == 0 || error == EWOULDBLOCK) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; goto done; } /* The actual scan does not sleep on kq, so unlock the kernel. */ KERNEL_UNLOCK(); /* * Put the end marker in the queue to limit the scan to the events * that are currently active. This prevents events from being * recollected if they reactivate during scan. * * If a partial scan has been performed already but no events have * been collected, reposition the end marker to make any new events * reachable. */ if (!scan->kqs_queued) { TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe); scan->kqs_queued = 1; } else if (scan->kqs_nevent == 0) { TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe); TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe); } TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe); while (nkev < maxevents) { kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe); if (kn->kn_filter == EVFILT_MARKER) { if (kn == &scan->kqs_end) break; /* Move start marker past another thread's marker. */ TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe); TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start, kn_tqe); continue; } if (!knote_acquire(kn, NULL, 0)) { /* knote_acquire() has released kq_lock. */ mtx_enter(&kq->kq_lock); continue; } kqueue_check(kq); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; kqueue_check(kq); if (kn->kn_status & KN_DISABLED) { knote_release(kn); continue; } mtx_leave(&kq->kq_lock); /* Drop expired kqpoll knotes. */ if (p->p_kq == kq && p->p_kq_serial > (unsigned long)kn->kn_udata) { filter_detach(kn); knote_drop(kn, p); mtx_enter(&kq->kq_lock); continue; } /* * Invalidate knotes whose vnodes have been revoked. * This is a workaround; it is tricky to clear existing * knotes and prevent new ones from being registered * with the current revocation mechanism. */ if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL && kn->kn_fp->f_type == DTYPE_VNODE) { struct vnode *vp = kn->kn_fp->f_data; if (__predict_false(vp->v_op == &dead_vops && kn->kn_fop != &dead_filtops)) { filter_detach(kn); kn->kn_fop = &dead_filtops; /* * Check if the event should be delivered. * Use f_event directly because this is * a special situation. */ if (kn->kn_fop->f_event(kn, 0) == 0) { filter_detach(kn); knote_drop(kn, p); mtx_enter(&kq->kq_lock); continue; } } } memset(kevp, 0, sizeof(*kevp)); if (filter_process(kn, kevp) == 0) { mtx_enter(&kq->kq_lock); if ((kn->kn_status & KN_QUEUED) == 0) kn->kn_status &= ~KN_ACTIVE; knote_release(kn); kqueue_check(kq); continue; } /* * Post-event action on the note */ if (kevp->flags & EV_ONESHOT) { filter_detach(kn); knote_drop(kn, p); mtx_enter(&kq->kq_lock); } else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) { mtx_enter(&kq->kq_lock); if (kevp->flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; if ((kn->kn_status & KN_QUEUED) == 0) kn->kn_status &= ~KN_ACTIVE; knote_release(kn); } else { mtx_enter(&kq->kq_lock); if ((kn->kn_status & KN_QUEUED) == 0) { kqueue_check(kq); kq->kq_count++; kn->kn_status |= KN_QUEUED; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); /* Wakeup is done after loop. 
*/ reinserted = 1; } knote_release(kn); } kqueue_check(kq); kevp++; nkev++; scan->kqs_nevent++; } TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe); if (reinserted && kq->kq_count != 0) kqueue_wakeup(kq); mtx_leave(&kq->kq_lock); if (scan->kqs_nevent == 0) goto retry; done: *errorp = error; return (nkev); } void kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq) { memset(scan, 0, sizeof(*scan)); KQREF(kq); scan->kqs_kq = kq; scan->kqs_start.kn_filter = EVFILT_MARKER; scan->kqs_start.kn_status = KN_PROCESSING; scan->kqs_end.kn_filter = EVFILT_MARKER; scan->kqs_end.kn_status = KN_PROCESSING; } void kqueue_scan_finish(struct kqueue_scan_state *scan) { struct kqueue *kq = scan->kqs_kq; KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER); KASSERT(scan->kqs_start.kn_status == KN_PROCESSING); KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER); KASSERT(scan->kqs_end.kn_status == KN_PROCESSING); if (scan->kqs_queued) { scan->kqs_queued = 0; mtx_enter(&kq->kq_lock); TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe); mtx_leave(&kq->kq_lock); } KQRELE(kq); } /* * XXX * This could be expanded to call kqueue_scan, if desired. */ int kqueue_read(struct file *fp, struct uio *uio, int fflags) { return (ENXIO); } int kqueue_write(struct file *fp, struct uio *uio, int fflags) { return (ENXIO); } int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) { return (ENOTTY); } int kqueue_stat(struct file *fp, struct stat *st, struct proc *p) { struct kqueue *kq = fp->f_data; memset(st, 0, sizeof(*st)); st->st_size = kq->kq_count; /* unlocked read */ st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO; return (0); } void kqueue_purge(struct proc *p, struct kqueue *kq) { int i; mtx_enter(&kq->kq_lock); for (i = 0; i < kq->kq_knlistsize; i++) knote_remove(p, kq, &kq->kq_knlist, i, 1); if (kq->kq_knhashmask != 0) { for (i = 0; i < kq->kq_knhashmask + 1; i++) knote_remove(p, kq, &kq->kq_knhash, i, 1); } mtx_leave(&kq->kq_lock); } void kqueue_terminate(struct proc *p, struct kqueue *kq) { struct knote *kn; int state; mtx_enter(&kq->kq_lock); /* * Any remaining entries should be scan markers. * They are removed when the ongoing scans finish. */ KASSERT(kq->kq_count == 0); TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) KASSERT(kn->kn_filter == EVFILT_MARKER); kq->kq_state |= KQ_DYING; state = kq->kq_state; kqueue_wakeup(kq); mtx_leave(&kq->kq_lock); /* * Any knotes that were attached to this kqueue were deleted * by knote_fdclose() when this kqueue's file descriptor was closed. */ KASSERT(klist_empty(&kq->kq_klist)); if (state & KQ_TASK) taskq_del_barrier(systqmp, &kq->kq_task); } int kqueue_close(struct file *fp, struct proc *p) { struct kqueue *kq = fp->f_data; fp->f_data = NULL; kqueue_purge(p, kq); kqueue_terminate(p, kq); KQRELE(kq); return (0); } static void kqueue_task(void *arg) { struct kqueue *kq = arg; mtx_enter(&kqueue_klist_lock); KNOTE(&kq->kq_klist, 0); mtx_leave(&kqueue_klist_lock); } void kqueue_wakeup(struct kqueue *kq) { MUTEX_ASSERT_LOCKED(&kq->kq_lock); if (kq->kq_state & KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if (!klist_empty(&kq->kq_klist)) { /* Defer activation to avoid recursion. 
*/ kq->kq_state |= KQ_TASK; task_add(systqmp, &kq->kq_task); } } static void kqueue_expand_hash(struct kqueue *kq) { struct knlist *hash; u_long hashmask; MUTEX_ASSERT_LOCKED(&kq->kq_lock); if (kq->kq_knhashmask == 0) { mtx_leave(&kq->kq_lock); hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask); mtx_enter(&kq->kq_lock); if (kq->kq_knhashmask == 0) { kq->kq_knhash = hash; kq->kq_knhashmask = hashmask; } else { /* Another thread has allocated the hash. */ mtx_leave(&kq->kq_lock); hashfree(hash, KN_HASHSIZE, M_KEVENT); mtx_enter(&kq->kq_lock); } } } static void kqueue_expand_list(struct kqueue *kq, int fd) { struct knlist *list, *olist; int size, osize; MUTEX_ASSERT_LOCKED(&kq->kq_lock); if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; mtx_leave(&kq->kq_lock); while (size <= fd) size += KQEXTENT; list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK); mtx_enter(&kq->kq_lock); if (kq->kq_knlistsize <= fd) { memcpy(list, kq->kq_knlist, kq->kq_knlistsize * sizeof(*list)); memset(&list[kq->kq_knlistsize], 0, (size - kq->kq_knlistsize) * sizeof(*list)); olist = kq->kq_knlist; osize = kq->kq_knlistsize; kq->kq_knlist = list; kq->kq_knlistsize = size; mtx_leave(&kq->kq_lock); free(olist, M_KEVENT, osize * sizeof(*list)); mtx_enter(&kq->kq_lock); } else { /* Another thread has expanded the list. */ mtx_leave(&kq->kq_lock); free(list, M_KEVENT, size * sizeof(*list)); mtx_enter(&kq->kq_lock); } } } /* * Acquire a knote, return non-zero on success, 0 on failure. * * If we cannot acquire the knote we sleep and return 0. The knote * may be stale on return in this case and the caller must restart * whatever loop they are in. * * If we are about to sleep and klist is non-NULL, the list is unlocked * before sleep and remains unlocked on return. */ int knote_acquire(struct knote *kn, struct klist *klist, int ls) { struct kqueue *kq = kn->kn_kq; MUTEX_ASSERT_LOCKED(&kq->kq_lock); KASSERT(kn->kn_filter != EVFILT_MARKER); if (kn->kn_status & KN_PROCESSING) { kn->kn_status |= KN_WAITING; if (klist != NULL) { mtx_leave(&kq->kq_lock); klist_unlock(klist, ls); /* XXX Timeout resolves potential loss of wakeup. */ tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1)); } else { msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts", SEC_TO_NSEC(1)); } /* knote may be stale now */ return (0); } kn->kn_status |= KN_PROCESSING; return (1); } /* * Release an acquired knote, clearing KN_PROCESSING. */ void knote_release(struct knote *kn) { MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock); KASSERT(kn->kn_filter != EVFILT_MARKER); KASSERT(kn->kn_status & KN_PROCESSING); if (kn->kn_status & KN_WAITING) { kn->kn_status &= ~KN_WAITING; wakeup(kn); } kn->kn_status &= ~KN_PROCESSING; /* kn should not be accessed anymore */ } /* * activate one knote. */ void knote_activate(struct knote *kn) { MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock); kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) knote_enqueue(kn); } /* * walk down a list of knotes, activating them if their event has triggered. 
*/ void knote(struct klist *list, long hint) { struct knote *kn, *kn0; struct kqueue *kq; KLIST_ASSERT_LOCKED(list); SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) { if (filter_event(kn, hint)) { kq = kn->kn_kq; mtx_enter(&kq->kq_lock); knote_activate(kn); mtx_leave(&kq->kq_lock); } } } /* * remove all knotes from a specified knlist */ void knote_remove(struct proc *p, struct kqueue *kq, struct knlist **plist, int idx, int purge) { struct knote *kn; MUTEX_ASSERT_LOCKED(&kq->kq_lock); /* Always fetch array pointer as another thread can resize kq_knlist. */ while ((kn = SLIST_FIRST(*plist + idx)) != NULL) { KASSERT(kn->kn_kq == kq); if (!purge) { /* Skip pending badfd knotes. */ while (kn->kn_fop == &badfd_filtops) { kn = SLIST_NEXT(kn, kn_link); if (kn == NULL) return; KASSERT(kn->kn_kq == kq); } } if (!knote_acquire(kn, NULL, 0)) { /* knote_acquire() has released kq_lock. */ mtx_enter(&kq->kq_lock); continue; } mtx_leave(&kq->kq_lock); filter_detach(kn); /* * Notify poll(2) and select(2) when a monitored * file descriptor is closed. * * This reuses the original knote for delivering the * notification so as to avoid allocating memory. */ if (!purge && (kn->kn_flags & (__EV_POLL | __EV_SELECT)) && !(p->p_kq == kq && p->p_kq_serial > (unsigned long)kn->kn_udata) && kn->kn_fop != &badfd_filtops) { KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD); FRELE(kn->kn_fp, p); kn->kn_fp = NULL; kn->kn_fop = &badfd_filtops; filter_event(kn, 0); mtx_enter(&kq->kq_lock); knote_activate(kn); knote_release(kn); continue; } knote_drop(kn, p); mtx_enter(&kq->kq_lock); } } /* * remove all knotes referencing a specified fd */ void knote_fdclose(struct proc *p, int fd) { struct filedesc *fdp = p->p_p->ps_fd; struct kqueue *kq; /* * fdplock can be ignored if the file descriptor table is being freed * because no other thread can access the fdp. */ if (fdp->fd_refcnt != 0) fdpassertlocked(fdp); LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) { mtx_enter(&kq->kq_lock); if (fd < kq->kq_knlistsize) knote_remove(p, kq, &kq->kq_knlist, fd, 0); mtx_leave(&kq->kq_lock); } } /* * handle a process exiting, including the triggering of NOTE_EXIT notes * XXX this could be more efficient, doing a single pass down the klist */ void knote_processexit(struct process *pr) { KERNEL_ASSERT_LOCKED(); KNOTE(&pr->ps_klist, NOTE_EXIT); /* remove other knotes hanging off the process */ klist_invalidate(&pr->ps_klist); } void knote_attach(struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct knlist *list; MUTEX_ASSERT_LOCKED(&kq->kq_lock); KASSERT(kn->kn_status & KN_PROCESSING); if (kn->kn_fop->f_flags & FILTEROP_ISFD) { KASSERT(kq->kq_knlistsize > kn->kn_id); list = &kq->kq_knlist[kn->kn_id]; } else { KASSERT(kq->kq_knhashmask != 0); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); kq->kq_nknotes++; } void knote_detach(struct knote *kn) { struct kqueue *kq = kn->kn_kq; struct knlist *list; MUTEX_ASSERT_LOCKED(&kq->kq_lock); KASSERT(kn->kn_status & KN_PROCESSING); kq->kq_nknotes--; if (kn->kn_fop->f_flags & FILTEROP_ISFD) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; SLIST_REMOVE(list, kn, knote, kn_link); } /* * should be called at spl == 0, since we don't want to hold spl * while calling FRELE and pool_put. 
*/ void knote_drop(struct knote *kn, struct proc *p) { struct kqueue *kq = kn->kn_kq; KASSERT(kn->kn_filter != EVFILT_MARKER); mtx_enter(&kq->kq_lock); knote_detach(kn); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); if (kn->kn_status & KN_WAITING) { kn->kn_status &= ~KN_WAITING; wakeup(kn); } mtx_leave(&kq->kq_lock); if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL) FRELE(kn->kn_fp, p); pool_put(&knote_pool, kn); } void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; MUTEX_ASSERT_LOCKED(&kq->kq_lock); KASSERT(kn->kn_filter != EVFILT_MARKER); KASSERT((kn->kn_status & KN_QUEUED) == 0); kqueue_check(kq); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_check(kq); kqueue_wakeup(kq); } void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; MUTEX_ASSERT_LOCKED(&kq->kq_lock); KASSERT(kn->kn_filter != EVFILT_MARKER); KASSERT(kn->kn_status & KN_QUEUED); kqueue_check(kq); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; kqueue_check(kq); } /* * Assign parameters to the knote. * * The knote's object lock must be held. */ void knote_assign(const struct kevent *kev, struct knote *kn) { if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) KERNEL_ASSERT_LOCKED(); kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_udata = kev->udata; } /* * Submit the knote's event for delivery. * * The knote's object lock must be held. */ void knote_submit(struct knote *kn, struct kevent *kev) { if ((kn->kn_fop->f_flags & FILTEROP_MPSAFE) == 0) KERNEL_ASSERT_LOCKED(); if (kev != NULL) { *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_fflags = 0; kn->kn_data = 0; } } } void klist_init(struct klist *klist, const struct klistops *ops, void *arg) { SLIST_INIT(&klist->kl_list); klist->kl_ops = ops; klist->kl_arg = arg; } void klist_free(struct klist *klist) { KASSERT(SLIST_EMPTY(&klist->kl_list)); } void klist_insert(struct klist *klist, struct knote *kn) { int ls; ls = klist_lock(klist); SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext); klist_unlock(klist, ls); } void klist_insert_locked(struct klist *klist, struct knote *kn) { KLIST_ASSERT_LOCKED(klist); SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext); } void klist_remove(struct klist *klist, struct knote *kn) { int ls; ls = klist_lock(klist); SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext); klist_unlock(klist, ls); } void klist_remove_locked(struct klist *klist, struct knote *kn) { KLIST_ASSERT_LOCKED(klist); SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext); } /* * Detach all knotes from klist. The knotes are rewired to indicate EOF. * * The caller of this function must not hold any locks that can block * filterops callbacks that run with KN_PROCESSING. * Otherwise this function might deadlock. */ void klist_invalidate(struct klist *list) { struct knote *kn; struct kqueue *kq; struct proc *p = curproc; int ls; NET_ASSERT_UNLOCKED(); ls = klist_lock(list); while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) { kq = kn->kn_kq; mtx_enter(&kq->kq_lock); if (!knote_acquire(kn, list, ls)) { /* knote_acquire() has released kq_lock * and klist lock. 
*/ ls = klist_lock(list); continue; } mtx_leave(&kq->kq_lock); klist_unlock(list, ls); filter_detach(kn); if (kn->kn_fop->f_flags & FILTEROP_ISFD) { kn->kn_fop = &dead_filtops; filter_event(kn, 0); mtx_enter(&kq->kq_lock); knote_activate(kn); knote_release(kn); mtx_leave(&kq->kq_lock); } else { knote_drop(kn, p); } ls = klist_lock(list); } klist_unlock(list, ls); } static int klist_lock(struct klist *list) { int ls = 0; if (list->kl_ops != NULL) { ls = list->kl_ops->klo_lock(list->kl_arg); } else { KERNEL_LOCK(); ls = splhigh(); } return ls; } static void klist_unlock(struct klist *list, int ls) { if (list->kl_ops != NULL) { list->kl_ops->klo_unlock(list->kl_arg, ls); } else { splx(ls); KERNEL_UNLOCK(); } } static void klist_mutex_assertlk(void *arg) { struct mutex *mtx = arg; (void)mtx; MUTEX_ASSERT_LOCKED(mtx); } static int klist_mutex_lock(void *arg) { struct mutex *mtx = arg; mtx_enter(mtx); return 0; } static void klist_mutex_unlock(void *arg, int s) { struct mutex *mtx = arg; mtx_leave(mtx); } static const struct klistops mutex_klistops = { .klo_assertlk = klist_mutex_assertlk, .klo_lock = klist_mutex_lock, .klo_unlock = klist_mutex_unlock, }; void klist_init_mutex(struct klist *klist, struct mutex *mtx) { klist_init(klist, &mutex_klistops, mtx); } static void klist_rwlock_assertlk(void *arg) { struct rwlock *rwl = arg; (void)rwl; rw_assert_wrlock(rwl); } static int klist_rwlock_lock(void *arg) { struct rwlock *rwl = arg; rw_enter_write(rwl); return 0; } static void klist_rwlock_unlock(void *arg, int s) { struct rwlock *rwl = arg; rw_exit_write(rwl); } static const struct klistops rwlock_klistops = { .klo_assertlk = klist_rwlock_assertlk, .klo_lock = klist_rwlock_lock, .klo_unlock = klist_rwlock_unlock, }; void klist_init_rwlock(struct klist *klist, struct rwlock *rwl) { klist_init(klist, &rwlock_klistops, rwl); }
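/*
 * Editor's note: a minimal, hypothetical sketch of how a driver could plug
 * into the klist/knote machinery above, modelled on kqread_filtops and
 * klist_init_mutex() from this file.  Every "foo" identifier is invented
 * for illustration; the sketch assumes knote_assign() and knote_submit()
 * are visible to filter code, as they are to the filters above.
 */
#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/event.h>

struct foo_softc {
	struct mutex	sc_mtx;		/* protects sc_rklist and sc_count */
	struct klist	sc_rklist;	/* knotes waiting for read events */
	size_t		sc_count;	/* bytes currently readable */
};

static int
filt_fooread(struct knote *kn, long hint)
{
	struct foo_softc *sc = kn->kn_hook;

	/* KNOTE() callers and the wrappers below hold the klist mutex. */
	MUTEX_ASSERT_LOCKED(&sc->sc_mtx);
	kn->kn_data = sc->sc_count;
	return (kn->kn_data > 0);
}

static int
filt_foomodify(struct kevent *kev, struct knote *kn)
{
	struct foo_softc *sc = kn->kn_hook;
	int active;

	mtx_enter(&sc->sc_mtx);
	knote_assign(kev, kn);
	active = filt_fooread(kn, 0);
	mtx_leave(&sc->sc_mtx);
	return (active);
}

static int
filt_fooprocess(struct knote *kn, struct kevent *kev)
{
	struct foo_softc *sc = kn->kn_hook;
	int active;

	mtx_enter(&sc->sc_mtx);
	active = filt_fooread(kn, 0);
	if (active)
		knote_submit(kn, kev);
	mtx_leave(&sc->sc_mtx);
	return (active);
}

static void
filt_foordetach(struct knote *kn)
{
	struct foo_softc *sc = kn->kn_hook;

	klist_remove(&sc->sc_rklist, kn);
}

static const struct filterops fooread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach	= filt_foordetach,
	.f_event	= filt_fooread,
	.f_modify	= filt_foomodify,
	.f_process	= filt_fooprocess,
};

void
foo_attach(struct foo_softc *sc)
{
	mtx_init(&sc->sc_mtx, IPL_MPFLOOR);
	/* The klist borrows the driver mutex, as the kqueue klist does. */
	klist_init_mutex(&sc->sc_rklist, &sc->sc_mtx);
}

int
foo_kqfilter(struct foo_softc *sc, struct knote *kn)
{
	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	kn->kn_fop = &fooread_filtops;
	kn->kn_hook = sc;
	klist_insert(&sc->sc_rklist, kn);
	return (0);
}

/* Producer side: mark new data readable and notify attached knotes. */
void
foo_rxdone(struct foo_softc *sc, size_t len)
{
	mtx_enter(&sc->sc_mtx);
	sc->sc_count += len;
	KNOTE(&sc->sc_rklist, 0);
	mtx_leave(&sc->sc_mtx);
}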
/*	$OpenBSD: kern_time.c,v 1.157 2022/08/14 01:58:27 jsg Exp $	*/
/*	$NetBSD: kern_time.c,v 1.20 1996/02/18 11:57:06 fvdl Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_time.c	8.4 (Berkeley) 5/26/95
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/ktrace.h>
#include <sys/signalvar.h>
#include <sys/stdint.h>
#include <sys/pledge.h>
#include <sys/task.h>
#include <sys/timeout.h>
#include <sys/timetc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <dev/clock_subr.h>

int itimerfix(struct itimerval *);

/*
 * Time of day and interval timer support.
 *
 * These routines provide the kernel entry points to get and set
 * the time-of-day and per-process interval timers.  Subroutines
 * here provide support for adding and subtracting timeval structures
 * and decrementing interval timers, optionally reloading the interval
 * timers when they expire.
 */

/* This function is used by clock_settime and settimeofday */
int
settime(const struct timespec *ts)
{
	struct timespec now;

	/*
	 * Don't allow the time to be set forward so far it will wrap
	 * and become negative, thus allowing an attacker to bypass
	 * the next check below.  The cutoff is 1 year before rollover
	 * occurs, so even if the attacker uses adjtime(2) to move
	 * the time past the cutoff, it will take a very long time
	 * to get to the wrap point.
	 *
	 * XXX: we check against UINT_MAX until we can figure out
	 * how to deal with the hardware RTCs.
*/ if (ts->tv_sec > UINT_MAX - 365*24*60*60) { printf("denied attempt to set clock forward to %lld\n", (long long)ts->tv_sec); return (EPERM); } /* * If the system is secure, we do not allow the time to be * set to an earlier value (it may be slowed using adjtime, * but not set back). This feature prevent interlopers from * setting arbitrary time stamps on files. */ nanotime(&now); if (securelevel > 1 && timespeccmp(ts, &now, <=)) { printf("denied attempt to set clock back %lld seconds\n", (long long)now.tv_sec - ts->tv_sec); return (EPERM); } tc_setrealtimeclock(ts); KERNEL_LOCK(); resettodr(); KERNEL_UNLOCK(); return (0); } int clock_gettime(struct proc *p, clockid_t clock_id, struct timespec *tp) { struct proc *q; int error = 0; switch (clock_id) { case CLOCK_REALTIME: nanotime(tp); break; case CLOCK_UPTIME: nanoruntime(tp); break; case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: nanouptime(tp); break; case CLOCK_PROCESS_CPUTIME_ID: nanouptime(tp); timespecsub(tp, &curcpu()->ci_schedstate.spc_runtime, tp); timespecadd(tp, &p->p_p->ps_tu.tu_runtime, tp); timespecadd(tp, &p->p_rtime, tp); break; case CLOCK_THREAD_CPUTIME_ID: nanouptime(tp); timespecsub(tp, &curcpu()->ci_schedstate.spc_runtime, tp); timespecadd(tp, &p->p_tu.tu_runtime, tp); timespecadd(tp, &p->p_rtime, tp); break; default: /* check for clock from pthread_getcpuclockid() */ if (__CLOCK_TYPE(clock_id) == CLOCK_THREAD_CPUTIME_ID) { KERNEL_LOCK(); q = tfind(__CLOCK_PTID(clock_id) - THREAD_PID_OFFSET); if (q == NULL || q->p_p != p->p_p) error = ESRCH; else *tp = q->p_tu.tu_runtime; KERNEL_UNLOCK(); } else error = EINVAL; break; } return (error); } int sys_clock_gettime(struct proc *p, void *v, register_t *retval) { struct sys_clock_gettime_args /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec *) tp; } */ *uap = v; struct timespec ats; int error; memset(&ats, 0, sizeof(ats)); if ((error = clock_gettime(p, SCARG(uap, clock_id), &ats)) != 0) return (error); error = copyout(&ats, SCARG(uap, tp), sizeof(ats)); #ifdef KTRACE if (error == 0 && KTRPOINT(p, KTR_STRUCT)) ktrabstimespec(p, &ats); #endif return (error); } int sys_clock_settime(struct proc *p, void *v, register_t *retval) { struct sys_clock_settime_args /* { syscallarg(clockid_t) clock_id; syscallarg(const struct timespec *) tp; } */ *uap = v; struct timespec ats; clockid_t clock_id; int error; if ((error = suser(p)) != 0) return (error); if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) return (error); clock_id = SCARG(uap, clock_id); switch (clock_id) { case CLOCK_REALTIME: if (!timespecisvalid(&ats)) return (EINVAL); if ((error = settime(&ats)) != 0) return (error); break; default: /* Other clocks are read-only */ return (EINVAL); } return (0); } int sys_clock_getres(struct proc *p, void *v, register_t *retval) { struct sys_clock_getres_args /* { syscallarg(clockid_t) clock_id; syscallarg(struct timespec *) tp; } */ *uap = v; clockid_t clock_id; struct bintime bt; struct timespec ts; struct proc *q; u_int64_t scale; int error = 0, realstathz; memset(&ts, 0, sizeof(ts)); realstathz = (stathz == 0) ? 
hz : stathz; clock_id = SCARG(uap, clock_id); switch (clock_id) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: case CLOCK_UPTIME: memset(&bt, 0, sizeof(bt)); rw_enter_read(&tc_lock); scale = ((1ULL << 63) / tc_getfrequency()) * 2; bt.frac = tc_getprecision() * scale; rw_exit_read(&tc_lock); BINTIME_TO_TIMESPEC(&bt, &ts); break; case CLOCK_PROCESS_CPUTIME_ID: case CLOCK_THREAD_CPUTIME_ID: ts.tv_nsec = 1000000000 / realstathz; break; default: /* check for clock from pthread_getcpuclockid() */ if (__CLOCK_TYPE(clock_id) == CLOCK_THREAD_CPUTIME_ID) { KERNEL_LOCK(); q = tfind(__CLOCK_PTID(clock_id) - THREAD_PID_OFFSET); if (q == NULL || q->p_p != p->p_p) error = ESRCH; else ts.tv_nsec = 1000000000 / realstathz; KERNEL_UNLOCK(); } else error = EINVAL; break; } if (error == 0 && SCARG(uap, tp)) { ts.tv_nsec = MAX(ts.tv_nsec, 1); error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); #ifdef KTRACE if (error == 0 && KTRPOINT(p, KTR_STRUCT)) ktrreltimespec(p, &ts); #endif } return error; } int sys_nanosleep(struct proc *p, void *v, register_t *retval) { static int chan; struct sys_nanosleep_args/* { syscallarg(const struct timespec *) rqtp; syscallarg(struct timespec *) rmtp; } */ *uap = v; struct timespec elapsed, remainder, request, start, stop; uint64_t nsecs; struct timespec *rmtp; int copyout_error, error; rmtp = SCARG(uap, rmtp); error = copyin(SCARG(uap, rqtp), &request, sizeof(request)); if (error) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrreltimespec(p, &request); #endif if (request.tv_sec < 0 || !timespecisvalid(&request)) return (EINVAL); do { getnanouptime(&start); nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&request), MAXTSLP)); error = tsleep_nsec(&chan, PWAIT | PCATCH, "nanoslp", nsecs); getnanouptime(&stop); timespecsub(&stop, &start, &elapsed); timespecsub(&request, &elapsed, &request); if (request.tv_sec < 0) timespecclear(&request); if (error != EWOULDBLOCK) break; } while (timespecisset(&request)); if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (rmtp) { memset(&remainder, 0, sizeof(remainder)); remainder = request; copyout_error = copyout(&remainder, rmtp, sizeof(remainder)); if (copyout_error) error = copyout_error; #ifdef KTRACE if (copyout_error == 0 && KTRPOINT(p, KTR_STRUCT)) ktrreltimespec(p, &remainder); #endif } return error; } int sys_gettimeofday(struct proc *p, void *v, register_t *retval) { struct sys_gettimeofday_args /* { syscallarg(struct timeval *) tp; syscallarg(struct timezone *) tzp; } */ *uap = v; struct timeval atv; static const struct timezone zerotz = { 0, 0 }; struct timeval *tp; struct timezone *tzp; int error = 0; tp = SCARG(uap, tp); tzp = SCARG(uap, tzp); if (tp) { memset(&atv, 0, sizeof(atv)); microtime(&atv); if ((error = copyout(&atv, tp, sizeof (atv)))) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrabstimeval(p, &atv); #endif } if (tzp) error = copyout(&zerotz, tzp, sizeof(zerotz)); return (error); } int sys_settimeofday(struct proc *p, void *v, register_t *retval) { struct sys_settimeofday_args /* { syscallarg(const struct timeval *) tv; syscallarg(const struct timezone *) tzp; } */ *uap = v; struct timezone atz; struct timeval atv; const struct timeval *tv; const struct timezone *tzp; int error; tv = SCARG(uap, tv); tzp = SCARG(uap, tzp); if ((error = suser(p))) return (error); /* Verify all parameters before changing time. 
*/ if (tv && (error = copyin(tv, &atv, sizeof(atv)))) return (error); if (tzp && (error = copyin(tzp, &atz, sizeof(atz)))) return (error); if (tv) { struct timespec ts; #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrabstimeval(p, &atv); #endif if (!timerisvalid(&atv)) return (EINVAL); TIMEVAL_TO_TIMESPEC(&atv, &ts); if ((error = settime(&ts)) != 0) return (error); } return (0); } #define ADJFREQ_MAX (500000000LL << 32) #define ADJFREQ_MIN (-ADJFREQ_MAX) int sys_adjfreq(struct proc *p, void *v, register_t *retval) { struct sys_adjfreq_args /* { syscallarg(const int64_t *) freq; syscallarg(int64_t *) oldfreq; } */ *uap = v; int error = 0; int64_t f, oldf; const int64_t *freq = SCARG(uap, freq); int64_t *oldfreq = SCARG(uap, oldfreq); if (freq) { if ((error = suser(p))) return (error); if ((error = copyin(freq, &f, sizeof(f)))) return (error); if (f < ADJFREQ_MIN || f > ADJFREQ_MAX) return (EINVAL); } rw_enter(&tc_lock, (freq == NULL) ? RW_READ : RW_WRITE); if (oldfreq) { tc_adjfreq(&oldf, NULL); if ((error = copyout(&oldf, oldfreq, sizeof(oldf)))) goto out; } if (freq) tc_adjfreq(NULL, &f); out: rw_exit(&tc_lock); return (error); } int sys_adjtime(struct proc *p, void *v, register_t *retval) { struct sys_adjtime_args /* { syscallarg(const struct timeval *) delta; syscallarg(struct timeval *) olddelta; } */ *uap = v; struct timeval atv; const struct timeval *delta = SCARG(uap, delta); struct timeval *olddelta = SCARG(uap, olddelta); int64_t adjustment, remaining; int error; error = pledge_adjtime(p, delta); if (error) return error; if (delta) { if ((error = suser(p))) return (error); if ((error = copyin(delta, &atv, sizeof(struct timeval)))) return (error); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrreltimeval(p, &atv); #endif if (!timerisvalid(&atv)) return (EINVAL); if (atv.tv_sec > INT64_MAX / 1000000) return EINVAL; if (atv.tv_sec < INT64_MIN / 1000000) return EINVAL; adjustment = atv.tv_sec * 1000000; if (adjustment > INT64_MAX - atv.tv_usec) return EINVAL; adjustment += atv.tv_usec; rw_enter_write(&tc_lock); } if (olddelta) { tc_adjtime(&remaining, NULL); memset(&atv, 0, sizeof(atv)); atv.tv_sec = remaining / 1000000; atv.tv_usec = remaining % 1000000; if (atv.tv_usec < 0) { atv.tv_usec += 1000000; atv.tv_sec--; } if ((error = copyout(&atv, olddelta, sizeof(struct timeval)))) goto out; } if (delta) tc_adjtime(NULL, &adjustment); out: if (delta) rw_exit_write(&tc_lock); return (error); } struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK); /* * Get or set value of an interval timer. The process virtual and * profiling virtual time timers are kept internally in the * way they are specified externally: in time until they expire. * * The real time interval timer's it_value, in contrast, is kept as an * absolute time rather than as a delta, so that it is easy to keep * periodic real-time signals from drifting. * * Virtual time timers are processed in the hardclock() routine of * kern_clock.c. The real time timer is processed by a timeout * routine, called from the softclock() routine. Since a callout * may be delayed in real time due to interrupt processing in the system, * it is possible for the real time timeout routine (realitexpire, given below), * to be delayed in real time past when it is supposed to occur. It * does not suffice, therefore, to reload the real timer .it_value from the * real time timers .it_interval. Rather, we compute the next time in * absolute time the timer should go off. 
*/ void setitimer(int which, const struct itimerval *itv, struct itimerval *olditv) { struct itimerspec its, oldits; struct timespec now; struct itimerspec *itimer; struct process *pr; KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF); pr = curproc->p_p; itimer = &pr->ps_timer[which]; if (itv != NULL) { TIMEVAL_TO_TIMESPEC(&itv->it_value, &its.it_value); TIMEVAL_TO_TIMESPEC(&itv->it_interval, &its.it_interval); } if (which == ITIMER_REAL) { mtx_enter(&pr->ps_mtx); nanouptime(&now); } else mtx_enter(&itimer_mtx); if (olditv != NULL) oldits = *itimer; if (itv != NULL) { if (which == ITIMER_REAL) { if (timespecisset(&its.it_value)) { timespecadd(&its.it_value, &now, &its.it_value); timeout_at_ts(&pr->ps_realit_to, &its.it_value); } else timeout_del(&pr->ps_realit_to); } *itimer = its; } if (which == ITIMER_REAL) mtx_leave(&pr->ps_mtx); else mtx_leave(&itimer_mtx); if (olditv != NULL) { if (which == ITIMER_REAL && timespecisset(&oldits.it_value)) { if (timespeccmp(&oldits.it_value, &now, <)) timespecclear(&oldits.it_value); else { timespecsub(&oldits.it_value, &now, &oldits.it_value); } } TIMESPEC_TO_TIMEVAL(&olditv->it_value, &oldits.it_value); TIMESPEC_TO_TIMEVAL(&olditv->it_interval, &oldits.it_interval); } } void cancel_all_itimers(void) { struct itimerval itv; int i; timerclear(&itv.it_value); timerclear(&itv.it_interval); for (i = 0; i < nitems(curproc->p_p->ps_timer); i++) setitimer(i, &itv, NULL); } int sys_getitimer(struct proc *p, void *v, register_t *retval) { struct sys_getitimer_args /* { syscallarg(int) which; syscallarg(struct itimerval *) itv; } */ *uap = v; struct itimerval aitv; int which; which = SCARG(uap, which); if (which < ITIMER_REAL || which > ITIMER_PROF) return EINVAL; memset(&aitv, 0, sizeof(aitv)); setitimer(which, NULL, &aitv); return copyout(&aitv, SCARG(uap, itv), sizeof(aitv)); } int sys_setitimer(struct proc *p, void *v, register_t *retval) { struct sys_setitimer_args /* { syscallarg(int) which; syscallarg(const struct itimerval *) itv; syscallarg(struct itimerval *) oitv; } */ *uap = v; struct itimerval aitv, olditv; struct itimerval *newitvp, *olditvp; int error, which; which = SCARG(uap, which); if (which < ITIMER_REAL || which > ITIMER_PROF) return EINVAL; newitvp = olditvp = NULL; if (SCARG(uap, itv) != NULL) { error = copyin(SCARG(uap, itv), &aitv, sizeof(aitv)); if (error) return error; error = itimerfix(&aitv); if (error) return error; newitvp = &aitv; } if (SCARG(uap, oitv) != NULL) { memset(&olditv, 0, sizeof(olditv)); olditvp = &olditv; } if (newitvp == NULL && olditvp == NULL) return 0; setitimer(which, newitvp, olditvp); if (SCARG(uap, oitv) != NULL) return copyout(&olditv, SCARG(uap, oitv), sizeof(olditv)); return 0; } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. */ void realitexpire(void *arg) { struct timespec cts; struct process *pr = arg; struct itimerspec *tp = &pr->ps_timer[ITIMER_REAL]; int need_signal = 0; mtx_enter(&pr->ps_mtx); /* * Do nothing if the timer was cancelled or rescheduled while we * were entering the mutex. */ if (!timespecisset(&tp->it_value) || timeout_pending(&pr->ps_realit_to)) goto out; /* The timer expired. We need to send the signal. */ need_signal = 1; /* One-shot timers are not reloaded. 
*/ if (!timespecisset(&tp->it_interval)) { timespecclear(&tp->it_value); goto out; } /* * Find the nearest future expiration point and restart * the timeout. */ nanouptime(&cts); while (timespeccmp(&tp->it_value, &cts, <=)) timespecadd(&tp->it_value, &tp->it_interval, &tp->it_value); if ((pr->ps_flags & PS_EXITING) == 0) timeout_at_ts(&pr->ps_realit_to, &tp->it_value); out: mtx_leave(&pr->ps_mtx); if (need_signal) prsignal(pr, SIGALRM); } /* * Check if the given setitimer(2) input is valid. Clear it_interval * if it_value is unset. Round it_interval up to the minimum interval * if necessary. */ int itimerfix(struct itimerval *itv) { static const struct timeval max = { .tv_sec = UINT_MAX, .tv_usec = 0 }; struct timeval min_interval = { .tv_sec = 0, .tv_usec = tick }; if (itv->it_value.tv_sec < 0 || !timerisvalid(&itv->it_value)) return EINVAL; if (timercmp(&itv->it_value, &max, >)) return EINVAL; if (itv->it_interval.tv_sec < 0 || !timerisvalid(&itv->it_interval)) return EINVAL; if (timercmp(&itv->it_interval, &max, >)) return EINVAL; if (!timerisset(&itv->it_value)) timerclear(&itv->it_interval); if (timerisset(&itv->it_interval)) { if (timercmp(&itv->it_interval, &min_interval, <)) itv->it_interval = min_interval; } return 0; } /* * Decrement an interval timer by the given number of nanoseconds. * If the timer expires and it is periodic then reload it. When reloading * the timer we subtract any overrun from the next period so that the timer * does not drift. */ int itimerdecr(struct itimerspec *itp, long nsec) { struct timespec decrement; NSEC_TO_TIMESPEC(nsec, &decrement); mtx_enter(&itimer_mtx); /* * Double-check that the timer is enabled. A different thread * in setitimer(2) may have disabled it while we were entering * the mutex. */ if (!timespecisset(&itp->it_value)) { mtx_leave(&itimer_mtx); return (1); } /* * The timer is enabled. Update and reload it as needed. */ timespecsub(&itp->it_value, &decrement, &itp->it_value); if (itp->it_value.tv_sec >= 0 && timespecisset(&itp->it_value)) { mtx_leave(&itimer_mtx); return (1); } if (!timespecisset(&itp->it_interval)) { timespecclear(&itp->it_value); mtx_leave(&itimer_mtx); return (0); } while (itp->it_value.tv_sec < 0 || !timespecisset(&itp->it_value)) timespecadd(&itp->it_value, &itp->it_interval, &itp->it_value); mtx_leave(&itimer_mtx); return (0); } struct mutex ratecheck_mtx = MUTEX_INITIALIZER(IPL_HIGH); /* * ratecheck(): simple time-based rate-limit checking. see ratecheck(9) * for usage and rationale. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); mtx_enter(&ratecheck_mtx); timersub(&tv, lasttime, &delta); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timercmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } mtx_leave(&ratecheck_mtx); return (rv); } struct mutex ppsratecheck_mtx = MUTEX_INITIALIZER(IPL_HIGH); /* * ppsratecheck(): packets (or events) per second limitation. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { struct timeval tv, delta; int rv; microuptime(&tv); mtx_enter(&ppsratecheck_mtx); timersub(&tv, lasttime, &delta); /* * check for 0,0 is so that the message will be seen at least once. * if more than one second have passed since the last update of * lasttime, reset the counter. 
* * we do increment *curpps even in *curpps < maxpps case, as some may * try to use *curpps for stat purposes as well. */ if (maxpps == 0) rv = 0; else if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) || delta.tv_sec >= 1) { *lasttime = tv; *curpps = 0; rv = 1; } else if (maxpps < 0) rv = 1; else if (*curpps < maxpps) rv = 1; else rv = 0; /* be careful about wrap-around */ if (*curpps + 1 > *curpps) *curpps = *curpps + 1; mtx_leave(&ppsratecheck_mtx); return (rv); } todr_chip_handle_t todr_handle; int inittodr_done; #define MINYEAR ((OpenBSD / 100) - 1) /* minimum plausible year */ /* * inittodr: * * Initialize time from the time-of-day register. */ void inittodr(time_t base) { time_t deltat; struct timeval rtctime; struct timespec ts; int badbase; inittodr_done = 1; if (base < (MINYEAR - 1970) * SECYR) { printf("WARNING: preposterous time in file system\n"); /* read the system clock anyway */ base = (MINYEAR - 1970) * SECYR; badbase = 1; } else badbase = 0; rtctime.tv_sec = base; rtctime.tv_usec = 0; if (todr_handle == NULL || todr_gettime(todr_handle, &rtctime) != 0 || rtctime.tv_sec < (MINYEAR - 1970) * SECYR) { /* * Believe the time in the file system for lack of * anything better, resetting the TODR. */ rtctime.tv_sec = base; rtctime.tv_usec = 0; if (todr_handle != NULL && !badbase) printf("WARNING: bad clock chip time\n"); ts.tv_sec = rtctime.tv_sec; ts.tv_nsec = rtctime.tv_usec * 1000; tc_setclock(&ts); goto bad; } else { ts.tv_sec = rtctime.tv_sec; ts.tv_nsec = rtctime.tv_usec * 1000; tc_setclock(&ts); } if (!badbase) { /* * See if we gained/lost two or more days; if * so, assume something is amiss. */ deltat = rtctime.tv_sec - base; if (deltat < 0) deltat = -deltat; if (deltat < 2 * SECDAY) return; /* all is well */ #ifndef SMALL_KERNEL printf("WARNING: clock %s %lld days\n", rtctime.tv_sec < base ? "lost" : "gained", (long long)(deltat / SECDAY)); #endif } bad: printf("WARNING: CHECK AND RESET THE DATE!\n"); } /* * resettodr: * * Reset the time-of-day register with the current time. */ void resettodr(void) { struct timeval rtctime; /* * Skip writing the RTC if inittodr(9) never ran. We don't * want to overwrite a reasonable value with a nonsense value. */ if (!inittodr_done) return; microtime(&rtctime); if (todr_handle != NULL && todr_settime(todr_handle, &rtctime) != 0) printf("WARNING: can't update clock chip time\n"); } void todr_attach(struct todr_chip_handle *todr) { todr_handle = todr; } #define RESETTODR_PERIOD 1800 void periodic_resettodr(void *); void perform_resettodr(void *); struct timeout resettodr_to = TIMEOUT_INITIALIZER(periodic_resettodr, NULL); struct task resettodr_task = TASK_INITIALIZER(perform_resettodr, NULL); void periodic_resettodr(void *arg __unused) { task_add(systq, &resettodr_task); } void perform_resettodr(void *arg __unused) { resettodr(); timeout_add_sec(&resettodr_to, RESETTODR_PERIOD); } void start_periodic_resettodr(void) { timeout_add_sec(&resettodr_to, RESETTODR_PERIOD); } void stop_periodic_resettodr(void) { timeout_del(&resettodr_to); task_del(systq, &resettodr_task); }
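/*
 * A minimal usage sketch, not part of kern_time.c: it shows the typical
 * pattern for the ratecheck() and ppsratecheck() helpers defined above
 * (see ratecheck(9)).  The example_* names, the 10 second interval and
 * the 100 events-per-second limit are illustrative assumptions.
 */
#if 0	/* illustrative sketch, never compiled */
void
example_log_overrun(void)
{
	static struct timeval last_complaint;
	static const struct timeval complaint_interval = { 10, 0 };

	/* Emit at most one warning per complaint_interval. */
	if (ratecheck(&last_complaint, &complaint_interval))
		printf("example: input overrun\n");
}

int
example_accept_event(void)
{
	static struct timeval last_second;
	static int curpps;

	/* Allow at most 100 events per second; callers drop the rest. */
	return (ppsratecheck(&last_second, &curpps, 100));
}
#endif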
/*	$OpenBSD: subr_pool.c,v 1.236 2022/08/14 01:58:28 jsg Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/percpu.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_items' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */

/* List of all pools */
SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
*/ unsigned int pool_serial; unsigned int pool_count; /* Lock the previous variables making up the global pool state */ struct rwlock pool_lock = RWLOCK_INITIALIZER("pools"); /* Private pool for page header structures */ struct pool phpool; struct pool_lock_ops { void (*pl_init)(struct pool *, union pool_lock *, const struct lock_type *); void (*pl_enter)(union pool_lock *); int (*pl_enter_try)(union pool_lock *); void (*pl_leave)(union pool_lock *); void (*pl_assert_locked)(union pool_lock *); void (*pl_assert_unlocked)(union pool_lock *); int (*pl_sleep)(void *, union pool_lock *, int, const char *); }; static const struct pool_lock_ops pool_lock_ops_mtx; static const struct pool_lock_ops pool_lock_ops_rw; #ifdef WITNESS #define pl_init(pp, pl) do { \ static const struct lock_type __lock_type = { .lt_name = #pl }; \ (pp)->pr_lock_ops->pl_init(pp, pl, &__lock_type); \ } while (0) #else /* WITNESS */ #define pl_init(pp, pl) (pp)->pr_lock_ops->pl_init(pp, pl, NULL) #endif /* WITNESS */ static inline void pl_enter(struct pool *pp, union pool_lock *pl) { pp->pr_lock_ops->pl_enter(pl); } static inline int pl_enter_try(struct pool *pp, union pool_lock *pl) { return pp->pr_lock_ops->pl_enter_try(pl); } static inline void pl_leave(struct pool *pp, union pool_lock *pl) { pp->pr_lock_ops->pl_leave(pl); } static inline void pl_assert_locked(struct pool *pp, union pool_lock *pl) { pp->pr_lock_ops->pl_assert_locked(pl); } static inline void pl_assert_unlocked(struct pool *pp, union pool_lock *pl) { pp->pr_lock_ops->pl_assert_unlocked(pl); } static inline int pl_sleep(struct pool *pp, void *ident, union pool_lock *lock, int priority, const char *wmesg) { return pp->pr_lock_ops->pl_sleep(ident, lock, priority, wmesg); } struct pool_item { u_long pi_magic; XSIMPLEQ_ENTRY(pool_item) pi_list; }; #define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic) struct pool_page_header { /* Page headers */ TAILQ_ENTRY(pool_page_header) ph_entry; /* pool page list */ XSIMPLEQ_HEAD(, pool_item) ph_items; /* free items on the page */ RBT_ENTRY(pool_page_header) ph_node; /* off-page page headers */ unsigned int ph_nmissing; /* # of chunks in use */ caddr_t ph_page; /* this page's address */ caddr_t ph_colored; /* page's colored address */ unsigned long ph_magic; uint64_t ph_timestamp; }; #define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */ #define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT) #ifdef MULTIPROCESSOR struct pool_cache_item { struct pool_cache_item *ci_next; /* next item in list */ unsigned long ci_nitems; /* number of items in list */ TAILQ_ENTRY(pool_cache_item) ci_nextl; /* entry in list of lists */ }; /* we store whether the cached item is poisoned in the high bit of nitems */ #define POOL_CACHE_ITEM_NITEMS_MASK 0x7ffffffUL #define POOL_CACHE_ITEM_NITEMS_POISON 0x8000000UL #define POOL_CACHE_ITEM_NITEMS(_ci) \ ((_ci)->ci_nitems & POOL_CACHE_ITEM_NITEMS_MASK) #define POOL_CACHE_ITEM_POISONED(_ci) \ ISSET((_ci)->ci_nitems, POOL_CACHE_ITEM_NITEMS_POISON) struct pool_cache { struct pool_cache_item *pc_actv; /* active list of items */ unsigned long pc_nactv; /* actv head nitems cache */ struct pool_cache_item *pc_prev; /* previous list of items */ uint64_t pc_gen; /* generation number */ uint64_t pc_nget; /* # of successful requests */ uint64_t pc_nfail; /* # of unsuccessful reqs */ uint64_t pc_nput; /* # of releases */ uint64_t pc_nlget; /* # of list requests */ uint64_t pc_nlfail; /* # of fails getting a list */ uint64_t pc_nlput; /* # of list releases */ int pc_nout; }; void 
*pool_cache_get(struct pool *); void pool_cache_put(struct pool *, void *); void pool_cache_destroy(struct pool *); void pool_cache_gc(struct pool *); #endif void pool_cache_pool_info(struct pool *, struct kinfo_pool *); int pool_cache_info(struct pool *, void *, size_t *); int pool_cache_cpus_info(struct pool *, void *, size_t *); #ifdef POOL_DEBUG int pool_debug = 1; #else int pool_debug = 0; #endif #define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0) struct pool_page_header * pool_p_alloc(struct pool *, int, int *); void pool_p_insert(struct pool *, struct pool_page_header *); void pool_p_remove(struct pool *, struct pool_page_header *); void pool_p_free(struct pool *, struct pool_page_header *); void pool_update_curpage(struct pool *); void *pool_do_get(struct pool *, int, int *); void pool_do_put(struct pool *, void *); int pool_chk_page(struct pool *, struct pool_page_header *, int); int pool_chk(struct pool *); void pool_get_done(struct pool *, void *, void *); void pool_runqueue(struct pool *, int); void *pool_allocator_alloc(struct pool *, int, int *); void pool_allocator_free(struct pool *, void *); /* * The default pool allocator. */ void *pool_page_alloc(struct pool *, int, int *); void pool_page_free(struct pool *, void *); /* * safe for interrupts; this is the default allocator */ struct pool_allocator pool_allocator_single = { pool_page_alloc, pool_page_free, POOL_ALLOC_SIZE(PAGE_SIZE, POOL_ALLOC_ALIGNED) }; void *pool_multi_alloc(struct pool *, int, int *); void pool_multi_free(struct pool *, void *); struct pool_allocator pool_allocator_multi = { pool_multi_alloc, pool_multi_free, POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED) }; void *pool_multi_alloc_ni(struct pool *, int, int *); void pool_multi_free_ni(struct pool *, void *); struct pool_allocator pool_allocator_multi_ni = { pool_multi_alloc_ni, pool_multi_free_ni, POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED) }; #ifdef DDB void pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); void pool_print1(struct pool *, const char *, int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); #endif /* stale page garbage collectors */ void pool_gc_sched(void *); struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL); void pool_gc_pages(void *); struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL); #define POOL_WAIT_FREE SEC_TO_NSEC(1) #define POOL_WAIT_GC SEC_TO_NSEC(8) RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare); static inline int phtree_compare(const struct pool_page_header *a, const struct pool_page_header *b) { vaddr_t va = (vaddr_t)a->ph_page; vaddr_t vb = (vaddr_t)b->ph_page; /* the compares in this order are important for the NFIND to work */ if (vb < va) return (-1); if (vb > va) return (1); return (0); } RBT_GENERATE(phtree, pool_page_header, ph_node, phtree_compare); /* * Return the pool page header based on page address. 
*/ static inline struct pool_page_header * pr_find_pagehead(struct pool *pp, void *v) { struct pool_page_header *ph, key; if (POOL_INPGHDR(pp)) { caddr_t page; page = (caddr_t)((vaddr_t)v & pp->pr_pgmask); return ((struct pool_page_header *)(page + pp->pr_phoffset)); } key.ph_page = v; ph = RBT_NFIND(phtree, &pp->pr_phtree, &key); if (ph == NULL) panic("%s: %s: page header missing", __func__, pp->pr_wchan); KASSERT(ph->ph_page <= (caddr_t)v); if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v) panic("%s: %s: incorrect page", __func__, pp->pr_wchan); return (ph); } /* * Initialize the given pool resource structure. * * We export this routine to allow other kernel parts to declare * static pools that must be initialized before malloc() is available. */ void pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags, const char *wchan, struct pool_allocator *palloc) { int off = 0, space; unsigned int pgsize = PAGE_SIZE, items; size_t pa_pagesz; #ifdef DIAGNOSTIC struct pool *iter; #endif if (align == 0) align = ALIGN(1); if (size < sizeof(struct pool_item)) size = sizeof(struct pool_item); size = roundup(size, align); while (size * 8 > pgsize) pgsize <<= 1; if (palloc == NULL) { if (pgsize > PAGE_SIZE) { palloc = ISSET(flags, PR_WAITOK) ? &pool_allocator_multi_ni : &pool_allocator_multi; } else palloc = &pool_allocator_single; pa_pagesz = palloc->pa_pagesz; } else { size_t pgsizes; pa_pagesz = palloc->pa_pagesz; if (pa_pagesz == 0) pa_pagesz = POOL_ALLOC_DEFAULT; pgsizes = pa_pagesz & ~POOL_ALLOC_ALIGNED; /* make sure the allocator can fit at least one item */ if (size > pgsizes) { panic("%s: pool %s item size 0x%zx > " "allocator %p sizes 0x%zx", __func__, wchan, size, palloc, pgsizes); } /* shrink pgsize until it fits into the range */ while (!ISSET(pgsizes, pgsize)) pgsize >>= 1; } KASSERT(ISSET(pa_pagesz, pgsize)); items = pgsize / size; /* * Decide whether to put the page header off page to avoid * wasting too large a part of the page. Off-page page headers * go into an RB tree, so we can match a returned item with * its header based on the page address. */ if (ISSET(pa_pagesz, POOL_ALLOC_ALIGNED)) { if (pgsize - (size * items) > sizeof(struct pool_page_header)) { off = pgsize - sizeof(struct pool_page_header); } else if (sizeof(struct pool_page_header) * 2 >= size) { off = pgsize - sizeof(struct pool_page_header); items = off / size; } } KASSERT(items > 0); /* * Initialize the pool structure. */ memset(pp, 0, sizeof(*pp)); if (ISSET(flags, PR_RWLOCK)) { KASSERT(flags & PR_WAITOK); pp->pr_lock_ops = &pool_lock_ops_rw; } else pp->pr_lock_ops = &pool_lock_ops_mtx; TAILQ_INIT(&pp->pr_emptypages); TAILQ_INIT(&pp->pr_fullpages); TAILQ_INIT(&pp->pr_partpages); pp->pr_curpage = NULL; pp->pr_npages = 0; pp->pr_minitems = 0; pp->pr_minpages = 0; pp->pr_maxpages = 8; pp->pr_size = size; pp->pr_pgsize = pgsize; pp->pr_pgmask = ~0UL ^ (pgsize - 1); pp->pr_phoffset = off; pp->pr_itemsperpage = items; pp->pr_wchan = wchan; pp->pr_alloc = palloc; pp->pr_nitems = 0; pp->pr_nout = 0; pp->pr_hardlimit = UINT_MAX; pp->pr_hardlimit_warning = NULL; pp->pr_hardlimit_ratecap.tv_sec = 0; pp->pr_hardlimit_ratecap.tv_usec = 0; pp->pr_hardlimit_warning_last.tv_sec = 0; pp->pr_hardlimit_warning_last.tv_usec = 0; RBT_INIT(phtree, &pp->pr_phtree); /* * Use the space between the chunks and the page header * for cache coloring. */ space = POOL_INPGHDR(pp) ? 
pp->pr_phoffset : pp->pr_pgsize; space -= pp->pr_itemsperpage * pp->pr_size; pp->pr_align = align; pp->pr_maxcolors = (space / align) + 1; pp->pr_nget = 0; pp->pr_nfail = 0; pp->pr_nput = 0; pp->pr_npagealloc = 0; pp->pr_npagefree = 0; pp->pr_hiwat = 0; pp->pr_nidle = 0; pp->pr_ipl = ipl; pp->pr_flags = flags; pl_init(pp, &pp->pr_lock); pl_init(pp, &pp->pr_requests_lock); TAILQ_INIT(&pp->pr_requests); if (phpool.pr_size == 0) { pool_init(&phpool, sizeof(struct pool_page_header), 0, IPL_HIGH, 0, "phpool", NULL); /* make sure phpool won't "recurse" */ KASSERT(POOL_INPGHDR(&phpool)); } /* pglistalloc/constraint parameters */ pp->pr_crange = &kp_dirty; /* Insert this into the list of all pools. */ rw_enter_write(&pool_lock); #ifdef DIAGNOSTIC SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) { if (iter == pp) panic("%s: pool %s already on list", __func__, wchan); } #endif pp->pr_serial = ++pool_serial; if (pool_serial == 0) panic("%s: too much uptime", __func__); SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist); pool_count++; rw_exit_write(&pool_lock); } /* * Decommission a pool resource. */ void pool_destroy(struct pool *pp) { struct pool_page_header *ph; struct pool *prev, *iter; #ifdef MULTIPROCESSOR if (pp->pr_cache != NULL) pool_cache_destroy(pp); #endif #ifdef DIAGNOSTIC if (pp->pr_nout != 0) panic("%s: pool busy: still out: %u", __func__, pp->pr_nout); #endif /* Remove from global pool list */ rw_enter_write(&pool_lock); pool_count--; if (pp == SIMPLEQ_FIRST(&pool_head)) SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist); else { prev = SIMPLEQ_FIRST(&pool_head); SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) { if (iter == pp) { SIMPLEQ_REMOVE_AFTER(&pool_head, prev, pr_poollist); break; } prev = iter; } } rw_exit_write(&pool_lock); /* Remove all pages */ while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) { pl_enter(pp, &pp->pr_lock); pool_p_remove(pp, ph); pl_leave(pp, &pp->pr_lock); pool_p_free(pp, ph); } KASSERT(TAILQ_EMPTY(&pp->pr_fullpages)); KASSERT(TAILQ_EMPTY(&pp->pr_partpages)); } void pool_request_init(struct pool_request *pr, void (*handler)(struct pool *, void *, void *), void *cookie) { pr->pr_handler = handler; pr->pr_cookie = cookie; pr->pr_item = NULL; } void pool_request(struct pool *pp, struct pool_request *pr) { pl_enter(pp, &pp->pr_requests_lock); TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry); pool_runqueue(pp, PR_NOWAIT); pl_leave(pp, &pp->pr_requests_lock); } struct pool_get_memory { union pool_lock lock; void * volatile v; }; /* * Grab an item from the pool. 
*/ void * pool_get(struct pool *pp, int flags) { void *v = NULL; int slowdown = 0; KASSERT(flags & (PR_WAITOK | PR_NOWAIT)); if (pp->pr_flags & PR_RWLOCK) KASSERT(flags & PR_WAITOK); #ifdef MULTIPROCESSOR if (pp->pr_cache != NULL) { v = pool_cache_get(pp); if (v != NULL) goto good; } #endif pl_enter(pp, &pp->pr_lock); if (pp->pr_nout >= pp->pr_hardlimit) { if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL)) goto fail; } else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) { if (ISSET(flags, PR_NOWAIT)) goto fail; } pl_leave(pp, &pp->pr_lock); if ((slowdown || pool_debug == 2) && ISSET(flags, PR_WAITOK)) yield(); if (v == NULL) { struct pool_get_memory mem = { .v = NULL }; struct pool_request pr; #ifdef DIAGNOSTIC if (ISSET(flags, PR_WAITOK) && curproc == &proc0) panic("%s: cannot sleep for memory during boot", __func__); #endif pl_init(pp, &mem.lock); pool_request_init(&pr, pool_get_done, &mem); pool_request(pp, &pr); pl_enter(pp, &mem.lock); while (mem.v == NULL) pl_sleep(pp, &mem, &mem.lock, PSWP, pp->pr_wchan); pl_leave(pp, &mem.lock); v = mem.v; } #ifdef MULTIPROCESSOR good: #endif if (ISSET(flags, PR_ZERO)) memset(v, 0, pp->pr_size); TRACEPOINT(uvm, pool_get, pp, v, flags); return (v); fail: pp->pr_nfail++; pl_leave(pp, &pp->pr_lock); return (NULL); } void pool_get_done(struct pool *pp, void *xmem, void *v) { struct pool_get_memory *mem = xmem; pl_enter(pp, &mem->lock); mem->v = v; pl_leave(pp, &mem->lock); wakeup_one(mem); } void pool_runqueue(struct pool *pp, int flags) { struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl); struct pool_request *pr; pl_assert_unlocked(pp, &pp->pr_lock); pl_assert_locked(pp, &pp->pr_requests_lock); if (pp->pr_requesting++) return; do { pp->pr_requesting = 1; TAILQ_CONCAT(&prl, &pp->pr_requests, pr_entry); if (TAILQ_EMPTY(&prl)) continue; pl_leave(pp, &pp->pr_requests_lock); pl_enter(pp, &pp->pr_lock); pr = TAILQ_FIRST(&prl); while (pr != NULL) { int slowdown = 0; if (pp->pr_nout >= pp->pr_hardlimit) break; pr->pr_item = pool_do_get(pp, flags, &slowdown); if (pr->pr_item == NULL) /* || slowdown ? */ break; pr = TAILQ_NEXT(pr, pr_entry); } pl_leave(pp, &pp->pr_lock); while ((pr = TAILQ_FIRST(&prl)) != NULL && pr->pr_item != NULL) { TAILQ_REMOVE(&prl, pr, pr_entry); (*pr->pr_handler)(pp, pr->pr_cookie, pr->pr_item); } pl_enter(pp, &pp->pr_requests_lock); } while (--pp->pr_requesting); TAILQ_CONCAT(&pp->pr_requests, &prl, pr_entry); } void * pool_do_get(struct pool *pp, int flags, int *slowdown) { struct pool_item *pi; struct pool_page_header *ph; pl_assert_locked(pp, &pp->pr_lock); splassert(pp->pr_ipl); /* * Account for this item now to avoid races if we need to give up * pr_lock to allocate a page. 
*/ pp->pr_nout++; if (pp->pr_curpage == NULL) { pl_leave(pp, &pp->pr_lock); ph = pool_p_alloc(pp, flags, slowdown); pl_enter(pp, &pp->pr_lock); if (ph == NULL) { pp->pr_nout--; return (NULL); } pool_p_insert(pp, ph); } ph = pp->pr_curpage; pi = XSIMPLEQ_FIRST(&ph->ph_items); if (__predict_false(pi == NULL)) panic("%s: %s: page empty", __func__, pp->pr_wchan); if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) { panic("%s: %s free list modified: " "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx", __func__, pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic, POOL_IMAGIC(ph, pi)); } XSIMPLEQ_REMOVE_HEAD(&ph->ph_items, pi_list); #ifdef DIAGNOSTIC if (pool_debug && POOL_PHPOISON(ph)) { size_t pidx; uint32_t pval; if (poison_check(pi + 1, pp->pr_size - sizeof(*pi), &pidx, &pval)) { int *ip = (int *)(pi + 1); panic("%s: %s free list modified: " "page %p; item addr %p; offset 0x%zx=0x%x", __func__, pp->pr_wchan, ph->ph_page, pi, (pidx * sizeof(int)) + sizeof(*pi), ip[pidx]); } } #endif /* DIAGNOSTIC */ if (ph->ph_nmissing++ == 0) { /* * This page was previously empty. Move it to the list of * partially-full pages. This page is already curpage. */ TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry); TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry); pp->pr_nidle--; } if (ph->ph_nmissing == pp->pr_itemsperpage) { /* * This page is now full. Move it to the full list * and select a new current page. */ TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry); TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_entry); pool_update_curpage(pp); } pp->pr_nget++; return (pi); } /* * Return resource to the pool. */ void pool_put(struct pool *pp, void *v) { struct pool_page_header *ph, *freeph = NULL; #ifdef DIAGNOSTIC if (v == NULL) panic("%s: NULL item", __func__); #endif TRACEPOINT(uvm, pool_put, pp, v); #ifdef MULTIPROCESSOR if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) { pool_cache_put(pp, v); return; } #endif pl_enter(pp, &pp->pr_lock); pool_do_put(pp, v); pp->pr_nout--; pp->pr_nput++; /* is it time to free a page? */ if (pp->pr_nidle > pp->pr_maxpages && (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL && getnsecuptime() - ph->ph_timestamp > POOL_WAIT_FREE) { freeph = ph; pool_p_remove(pp, freeph); } pl_leave(pp, &pp->pr_lock); if (freeph != NULL) pool_p_free(pp, freeph); pool_wakeup(pp); } void pool_wakeup(struct pool *pp) { if (!TAILQ_EMPTY(&pp->pr_requests)) { pl_enter(pp, &pp->pr_requests_lock); pool_runqueue(pp, PR_NOWAIT); pl_leave(pp, &pp->pr_requests_lock); } } void pool_do_put(struct pool *pp, void *v) { struct pool_item *pi = v; struct pool_page_header *ph; splassert(pp->pr_ipl); ph = pr_find_pagehead(pp, v); #ifdef DIAGNOSTIC if (pool_debug) { struct pool_item *qi; XSIMPLEQ_FOREACH(qi, &ph->ph_items, pi_list) { if (pi == qi) { panic("%s: %s: double pool_put: %p", __func__, pp->pr_wchan, pi); } } } #endif /* DIAGNOSTIC */ pi->pi_magic = POOL_IMAGIC(ph, pi); XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list); #ifdef DIAGNOSTIC if (POOL_PHPOISON(ph)) poison_mem(pi + 1, pp->pr_size - sizeof(*pi)); #endif /* DIAGNOSTIC */ if (ph->ph_nmissing-- == pp->pr_itemsperpage) { /* * The page was previously completely full, move it to the * partially-full list. */ TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_entry); TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry); } if (ph->ph_nmissing == 0) { /* * The page is now empty, so move it to the empty page list. 
*/ pp->pr_nidle++; ph->ph_timestamp = getnsecuptime(); TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry); TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry); pool_update_curpage(pp); } } /* * Add N items to the pool. */ int pool_prime(struct pool *pp, int n) { struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl); struct pool_page_header *ph; int newpages; newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; while (newpages-- > 0) { int slowdown = 0; ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown); if (ph == NULL) /* or slowdown? */ break; TAILQ_INSERT_TAIL(&pl, ph, ph_entry); } pl_enter(pp, &pp->pr_lock); while ((ph = TAILQ_FIRST(&pl)) != NULL) { TAILQ_REMOVE(&pl, ph, ph_entry); pool_p_insert(pp, ph); } pl_leave(pp, &pp->pr_lock); return (0); } struct pool_page_header * pool_p_alloc(struct pool *pp, int flags, int *slowdown) { struct pool_page_header *ph; struct pool_item *pi; caddr_t addr; unsigned int order; int o; int n; pl_assert_unlocked(pp, &pp->pr_lock); KASSERT(pp->pr_size >= sizeof(*pi)); addr = pool_allocator_alloc(pp, flags, slowdown); if (addr == NULL) return (NULL); if (POOL_INPGHDR(pp)) ph = (struct pool_page_header *)(addr + pp->pr_phoffset); else { ph = pool_get(&phpool, flags); if (ph == NULL) { pool_allocator_free(pp, addr); return (NULL); } } XSIMPLEQ_INIT(&ph->ph_items); ph->ph_page = addr; addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors); ph->ph_colored = addr; ph->ph_nmissing = 0; arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic)); #ifdef DIAGNOSTIC /* use a bit in ph_magic to record if we poison page items */ if (pool_debug) SET(ph->ph_magic, POOL_MAGICBIT); else CLR(ph->ph_magic, POOL_MAGICBIT); #endif /* DIAGNOSTIC */ n = pp->pr_itemsperpage; o = 32; while (n--) { pi = (struct pool_item *)addr; pi->pi_magic = POOL_IMAGIC(ph, pi); if (o == 32) { order = arc4random(); o = 0; } if (ISSET(order, 1U << o++)) XSIMPLEQ_INSERT_TAIL(&ph->ph_items, pi, pi_list); else XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list); #ifdef DIAGNOSTIC if (POOL_PHPOISON(ph)) poison_mem(pi + 1, pp->pr_size - sizeof(*pi)); #endif /* DIAGNOSTIC */ addr += pp->pr_size; } return (ph); } void pool_p_free(struct pool *pp, struct pool_page_header *ph) { struct pool_item *pi; pl_assert_unlocked(pp, &pp->pr_lock); KASSERT(ph->ph_nmissing == 0); XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) { if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) { panic("%s: %s free list modified: " "page %p; item addr %p; offset 0x%x=0x%lx", __func__, pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic); } #ifdef DIAGNOSTIC if (POOL_PHPOISON(ph)) { size_t pidx; uint32_t pval; if (poison_check(pi + 1, pp->pr_size - sizeof(*pi), &pidx, &pval)) { int *ip = (int *)(pi + 1); panic("%s: %s free list modified: " "page %p; item addr %p; offset 0x%zx=0x%x", __func__, pp->pr_wchan, ph->ph_page, pi, pidx * sizeof(int), ip[pidx]); } } #endif } pool_allocator_free(pp, ph->ph_page); if (!POOL_INPGHDR(pp)) pool_put(&phpool, ph); } void pool_p_insert(struct pool *pp, struct pool_page_header *ph) { pl_assert_locked(pp, &pp->pr_lock); /* If the pool was depleted, point at the new page */ if (pp->pr_curpage == NULL) pp->pr_curpage = ph; TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry); if (!POOL_INPGHDR(pp)) RBT_INSERT(phtree, &pp->pr_phtree, ph); pp->pr_nitems += pp->pr_itemsperpage; pp->pr_nidle++; pp->pr_npagealloc++; if (++pp->pr_npages > pp->pr_hiwat) pp->pr_hiwat = pp->pr_npages; } void pool_p_remove(struct pool *pp, struct pool_page_header *ph) { pl_assert_locked(pp, &pp->pr_lock); pp->pr_npagefree++; 
pp->pr_npages--; pp->pr_nidle--; pp->pr_nitems -= pp->pr_itemsperpage; if (!POOL_INPGHDR(pp)) RBT_REMOVE(phtree, &pp->pr_phtree, ph); TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry); pool_update_curpage(pp); } void pool_update_curpage(struct pool *pp) { pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist); if (pp->pr_curpage == NULL) { pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist); } } void pool_setlowat(struct pool *pp, int n) { int prime = 0; pl_enter(pp, &pp->pr_lock); pp->pr_minitems = n; pp->pr_minpages = (n == 0) ? 0 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; if (pp->pr_nitems < n) prime = n - pp->pr_nitems; pl_leave(pp, &pp->pr_lock); if (prime > 0) pool_prime(pp, prime); } void pool_sethiwat(struct pool *pp, int n) { pp->pr_maxpages = (n == 0) ? 0 : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; } int pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap) { int error = 0; if (n < pp->pr_nout) { error = EINVAL; goto done; } pp->pr_hardlimit = n; pp->pr_hardlimit_warning = warnmsg; pp->pr_hardlimit_ratecap.tv_sec = ratecap; pp->pr_hardlimit_warning_last.tv_sec = 0; pp->pr_hardlimit_warning_last.tv_usec = 0; done: return (error); } void pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode) { pp->pr_crange = mode; } /* * Release all complete pages that have not been used recently. * * Returns non-zero if any pages have been reclaimed. */ int pool_reclaim(struct pool *pp) { struct pool_page_header *ph, *phnext; struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl); pl_enter(pp, &pp->pr_lock); for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { phnext = TAILQ_NEXT(ph, ph_entry); /* Check our minimum page claim */ if (pp->pr_npages <= pp->pr_minpages) break; /* * If freeing this page would put us below * the low water mark, stop now. */ if ((pp->pr_nitems - pp->pr_itemsperpage) < pp->pr_minitems) break; pool_p_remove(pp, ph); TAILQ_INSERT_TAIL(&pl, ph, ph_entry); } pl_leave(pp, &pp->pr_lock); if (TAILQ_EMPTY(&pl)) return (0); while ((ph = TAILQ_FIRST(&pl)) != NULL) { TAILQ_REMOVE(&pl, ph, ph_entry); pool_p_free(pp, ph); } return (1); } /* * Release all complete pages that have not been used recently * from all pools. */ void pool_reclaim_all(void) { struct pool *pp; rw_enter_read(&pool_lock); SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) pool_reclaim(pp); rw_exit_read(&pool_lock); } #ifdef DDB #include <machine/db_machdep.h> #include <ddb/db_output.h> /* * Diagnostic helpers. */ void pool_printit(struct pool *pp, const char *modif, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) { pool_print1(pp, modif, pr); } void pool_print_pagelist(struct pool_pagelist *pl, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) { struct pool_page_header *ph; struct pool_item *pi; TAILQ_FOREACH(ph, pl, ph_entry) { (*pr)("\t\tpage %p, color %p, nmissing %d\n", ph->ph_page, ph->ph_colored, ph->ph_nmissing); XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) { if (pi->pi_magic != POOL_IMAGIC(ph, pi)) { (*pr)("\t\t\titem %p, magic 0x%lx\n", pi, pi->pi_magic); } } } } void pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...) 
__attribute__((__format__(__kprintf__,1,2)))) { struct pool_page_header *ph; int print_pagelist = 0; char c; while ((c = *modif++) != '\0') { if (c == 'p') print_pagelist = 1; modif++; } (*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size, pp->pr_maxcolors); (*pr)("\talloc %p\n", pp->pr_alloc); (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); (*pr)("\n\tnget %lu, nfail %lu, nput %lu\n", pp->pr_nget, pp->pr_nfail, pp->pr_nput); (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n", pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle); if (print_pagelist == 0) return; if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) (*pr)("\n\tempty page list:\n"); pool_print_pagelist(&pp->pr_emptypages, pr); if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL) (*pr)("\n\tfull page list:\n"); pool_print_pagelist(&pp->pr_fullpages, pr); if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL) (*pr)("\n\tpartial-page list:\n"); pool_print_pagelist(&pp->pr_partpages, pr); if (pp->pr_curpage == NULL) (*pr)("\tno current page\n"); else (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); } void db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif) { struct pool *pp; char maxp[16]; int ovflw; char mode; mode = modif[0]; if (mode != '\0' && mode != 'a') { db_printf("usage: show all pools [/a]\n"); return; } if (mode == '\0') db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n", "Name", "Size", "Requests", "Fail", "Releases", "Pgreq", "Pgrel", "Npage", "Hiwat", "Minpg", "Maxpg", "Idle"); else db_printf("%-12s %18s %18s\n", "Name", "Address", "Allocator"); SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { if (mode == 'a') { db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp, pp->pr_alloc); continue; } if (!pp->pr_nget) continue; if (pp->pr_maxpages == UINT_MAX) snprintf(maxp, sizeof maxp, "inf"); else snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages); #define PRWORD(ovflw, fmt, width, fixed, val) do { \ (ovflw) += db_printf((fmt), \ (width) - (fixed) - (ovflw) > 0 ? 
\ (width) - (fixed) - (ovflw) : 0, \ (val)) - (width); \ if ((ovflw) < 0) \ (ovflw) = 0; \ } while (/* CONSTCOND */0) ovflw = 0; PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan); PRWORD(ovflw, " %*u", 4, 1, pp->pr_size); PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget); PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail); PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput); PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc); PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree); PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages); PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat); PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages); PRWORD(ovflw, " %*s", 6, 1, maxp); PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle); pool_chk(pp); } } #endif /* DDB */ #if defined(POOL_DEBUG) || defined(DDB) int pool_chk_page(struct pool *pp, struct pool_page_header *ph, int expected) { struct pool_item *pi; caddr_t page; int n; const char *label = pp->pr_wchan; page = (caddr_t)((u_long)ph & pp->pr_pgmask); if (page != ph->ph_page && POOL_INPGHDR(pp)) { printf("%s: ", label); printf("pool(%p:%s): page inconsistency: page %p; " "at page head addr %p (p %p)\n", pp, pp->pr_wchan, ph->ph_page, ph, page); return 1; } for (pi = XSIMPLEQ_FIRST(&ph->ph_items), n = 0; pi != NULL; pi = XSIMPLEQ_NEXT(&ph->ph_items, pi, pi_list), n++) { if ((caddr_t)pi < ph->ph_page || (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) { printf("%s: ", label); printf("pool(%p:%s): page inconsistency: page %p;" " item ordinal %d; addr %p\n", pp, pp->pr_wchan, ph->ph_page, n, pi); return (1); } if (pi->pi_magic != POOL_IMAGIC(ph, pi)) { printf("%s: ", label); printf("pool(%p:%s): free list modified: " "page %p; item ordinal %d; addr %p " "(p %p); offset 0x%x=0x%lx\n", pp, pp->pr_wchan, ph->ph_page, n, pi, page, 0, pi->pi_magic); } #ifdef DIAGNOSTIC if (POOL_PHPOISON(ph)) { size_t pidx; uint32_t pval; if (poison_check(pi + 1, pp->pr_size - sizeof(*pi), &pidx, &pval)) { int *ip = (int *)(pi + 1); printf("pool(%s): free list modified: " "page %p; item ordinal %d; addr %p " "(p %p); offset 0x%zx=0x%x\n", pp->pr_wchan, ph->ph_page, n, pi, page, pidx * sizeof(int), ip[pidx]); } } #endif /* DIAGNOSTIC */ } if (n + ph->ph_nmissing != pp->pr_itemsperpage) { printf("pool(%p:%s): page inconsistency: page %p;" " %d on list, %d missing, %d items per page\n", pp, pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing, pp->pr_itemsperpage); return 1; } if (expected >= 0 && n != expected) { printf("pool(%p:%s): page inconsistency: page %p;" " %d on list, %d missing, %d expected\n", pp, pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing, expected); return 1; } return 0; } int pool_chk(struct pool *pp) { struct pool_page_header *ph; int r = 0; TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_entry) r += pool_chk_page(pp, ph, pp->pr_itemsperpage); TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry) r += pool_chk_page(pp, ph, 0); TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry) r += pool_chk_page(pp, ph, -1); return (r); } #endif /* defined(POOL_DEBUG) || defined(DDB) */ #ifdef DDB void pool_walk(struct pool *pp, int full, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))), void (*func)(void *, int, int (*)(const char *, ...) 
__attribute__((__format__(__kprintf__,1,2))))) { struct pool_page_header *ph; struct pool_item *pi; caddr_t cp; int n; TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry) { cp = ph->ph_colored; n = ph->ph_nmissing; while (n--) { func(cp, full, pr); cp += pp->pr_size; } } TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry) { cp = ph->ph_colored; n = ph->ph_nmissing; do { XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) { if (cp == (caddr_t)pi) break; } if (cp != (caddr_t)pi) { func(cp, full, pr); n--; } cp += pp->pr_size; } while (n > 0); } } #endif /* * We have three different sysctls. * kern.pool.npools - the number of pools. * kern.pool.pool.<pool#> - the pool struct for the pool#. * kern.pool.name.<pool#> - the name for pool#. */ int sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp) { struct kinfo_pool pi; struct pool *pp; int rv = ENOENT; switch (name[0]) { case KERN_POOL_NPOOLS: if (namelen != 1) return (ENOTDIR); return (sysctl_rdint(oldp, oldlenp, NULL, pool_count)); case KERN_POOL_NAME: case KERN_POOL_POOL: case KERN_POOL_CACHE: case KERN_POOL_CACHE_CPUS: break; default: return (EOPNOTSUPP); } if (namelen != 2) return (ENOTDIR); rw_enter_read(&pool_lock); SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { if (name[1] == pp->pr_serial) break; } if (pp == NULL) goto done; switch (name[0]) { case KERN_POOL_NAME: rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan); break; case KERN_POOL_POOL: memset(&pi, 0, sizeof(pi)); pl_enter(pp, &pp->pr_lock); pi.pr_size = pp->pr_size; pi.pr_pgsize = pp->pr_pgsize; pi.pr_itemsperpage = pp->pr_itemsperpage; pi.pr_npages = pp->pr_npages; pi.pr_minpages = pp->pr_minpages; pi.pr_maxpages = pp->pr_maxpages; pi.pr_hardlimit = pp->pr_hardlimit; pi.pr_nout = pp->pr_nout; pi.pr_nitems = pp->pr_nitems; pi.pr_nget = pp->pr_nget; pi.pr_nput = pp->pr_nput; pi.pr_nfail = pp->pr_nfail; pi.pr_npagealloc = pp->pr_npagealloc; pi.pr_npagefree = pp->pr_npagefree; pi.pr_hiwat = pp->pr_hiwat; pi.pr_nidle = pp->pr_nidle; pl_leave(pp, &pp->pr_lock); pool_cache_pool_info(pp, &pi); rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi)); break; case KERN_POOL_CACHE: rv = pool_cache_info(pp, oldp, oldlenp); break; case KERN_POOL_CACHE_CPUS: rv = pool_cache_cpus_info(pp, oldp, oldlenp); break; } done: rw_exit_read(&pool_lock); return (rv); } void pool_gc_sched(void *null) { task_add(systqmp, &pool_gc_task); } void pool_gc_pages(void *null) { struct pool *pp; struct pool_page_header *ph, *freeph; int s; rw_enter_read(&pool_lock); s = splvm(); /* XXX go to splvm until all pools _setipl properly */ SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { #ifdef MULTIPROCESSOR if (pp->pr_cache != NULL) pool_cache_gc(pp); #endif if (pp->pr_nidle <= pp->pr_minpages || /* guess */ !pl_enter_try(pp, &pp->pr_lock)) /* try */ continue; /* is it time to free a page? */ if (pp->pr_nidle > pp->pr_minpages && (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL && getnsecuptime() - ph->ph_timestamp > POOL_WAIT_GC) { freeph = ph; pool_p_remove(pp, freeph); } else freeph = NULL; pl_leave(pp, &pp->pr_lock); if (freeph != NULL) pool_p_free(pp, freeph); } splx(s); rw_exit_read(&pool_lock); timeout_add_sec(&pool_gc_tick, 1); } /* * Pool backend allocators. 
*/ void * pool_allocator_alloc(struct pool *pp, int flags, int *slowdown) { void *v; v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown); #ifdef DIAGNOSTIC if (v != NULL && POOL_INPGHDR(pp)) { vaddr_t addr = (vaddr_t)v; if ((addr & pp->pr_pgmask) != addr) { panic("%s: %s page address %p isn't aligned to %u", __func__, pp->pr_wchan, v, pp->pr_pgsize); } } #endif return (v); } void pool_allocator_free(struct pool *pp, void *v) { struct pool_allocator *pa = pp->pr_alloc; (*pa->pa_free)(pp, v); } void * pool_page_alloc(struct pool *pp, int flags, int *slowdown) { struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd)); } void pool_page_free(struct pool *pp, void *v) { km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange); } void * pool_multi_alloc(struct pool *pp, int flags, int *slowdown) { struct kmem_va_mode kv = kv_intrsafe; struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; void *v; int s; if (POOL_INPGHDR(pp)) kv.kv_align = pp->pr_pgsize; kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; s = splvm(); v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd); splx(s); return (v); } void pool_multi_free(struct pool *pp, void *v) { struct kmem_va_mode kv = kv_intrsafe; int s; if (POOL_INPGHDR(pp)) kv.kv_align = pp->pr_pgsize; s = splvm(); km_free(v, pp->pr_pgsize, &kv, pp->pr_crange); splx(s); } void * pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown) { struct kmem_va_mode kv = kv_any; struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; void *v; if (POOL_INPGHDR(pp)) kv.kv_align = pp->pr_pgsize; kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; KERNEL_LOCK(); v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd); KERNEL_UNLOCK(); return (v); } void pool_multi_free_ni(struct pool *pp, void *v) { struct kmem_va_mode kv = kv_any; if (POOL_INPGHDR(pp)) kv.kv_align = pp->pr_pgsize; KERNEL_LOCK(); km_free(v, pp->pr_pgsize, &kv, pp->pr_crange); KERNEL_UNLOCK(); } #ifdef MULTIPROCESSOR struct pool pool_caches; /* per cpu cache entries */ void pool_cache_init(struct pool *pp) { struct cpumem *cm; struct pool_cache *pc; struct cpumem_iter i; if (pool_caches.pr_size == 0) { pool_init(&pool_caches, sizeof(struct pool_cache), CACHELINESIZE, IPL_NONE, PR_WAITOK | PR_RWLOCK, "plcache", NULL); } /* must be able to use the pool items as cache list items */ KASSERT(pp->pr_size >= sizeof(struct pool_cache_item)); cm = cpumem_get(&pool_caches); pl_init(pp, &pp->pr_cache_lock); arc4random_buf(pp->pr_cache_magic, sizeof(pp->pr_cache_magic)); TAILQ_INIT(&pp->pr_cache_lists); pp->pr_cache_nitems = 0; pp->pr_cache_timestamp = getnsecuptime(); pp->pr_cache_items = 8; pp->pr_cache_contention = 0; pp->pr_cache_ngc = 0; CPUMEM_FOREACH(pc, &i, cm) { pc->pc_actv = NULL; pc->pc_nactv = 0; pc->pc_prev = NULL; pc->pc_nget = 0; pc->pc_nfail = 0; pc->pc_nput = 0; pc->pc_nlget = 0; pc->pc_nlfail = 0; pc->pc_nlput = 0; pc->pc_nout = 0; } membar_producer(); pp->pr_cache = cm; } static inline void pool_cache_item_magic(struct pool *pp, struct pool_cache_item *ci) { unsigned long *entry = (unsigned long *)&ci->ci_nextl; entry[0] = pp->pr_cache_magic[0] ^ (u_long)ci; entry[1] = pp->pr_cache_magic[1] ^ (u_long)ci->ci_next; } static inline void pool_cache_item_magic_check(struct pool *pp, struct pool_cache_item *ci) { unsigned long *entry; unsigned long val; entry = (unsigned long *)&ci->ci_nextl; val = pp->pr_cache_magic[0] ^ (u_long)ci; if (*entry != val) goto fail; entry++; val = 
pp->pr_cache_magic[1] ^ (u_long)ci->ci_next; if (*entry != val) goto fail; return; fail: panic("%s: %s cpu free list modified: item addr %p+%zu 0x%lx!=0x%lx", __func__, pp->pr_wchan, ci, (caddr_t)entry - (caddr_t)ci, *entry, val); } static inline void pool_list_enter(struct pool *pp) { if (pl_enter_try(pp, &pp->pr_cache_lock) == 0) { pl_enter(pp, &pp->pr_cache_lock); pp->pr_cache_contention++; } } static inline void pool_list_leave(struct pool *pp) { pl_leave(pp, &pp->pr_cache_lock); } static inline struct pool_cache_item * pool_cache_list_alloc(struct pool *pp, struct pool_cache *pc) { struct pool_cache_item *pl; pool_list_enter(pp); pl = TAILQ_FIRST(&pp->pr_cache_lists); if (pl != NULL) { TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl); pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl); pool_cache_item_magic(pp, pl); pc->pc_nlget++; } else pc->pc_nlfail++; /* fold this cpus nout into the global while we have the lock */ pp->pr_cache_nout += pc->pc_nout; pc->pc_nout = 0; pool_list_leave(pp); return (pl); } static inline void pool_cache_list_free(struct pool *pp, struct pool_cache *pc, struct pool_cache_item *ci) { pool_list_enter(pp); if (TAILQ_EMPTY(&pp->pr_cache_lists)) pp->pr_cache_timestamp = getnsecuptime(); pp->pr_cache_nitems += POOL_CACHE_ITEM_NITEMS(ci); TAILQ_INSERT_TAIL(&pp->pr_cache_lists, ci, ci_nextl); pc->pc_nlput++; /* fold this cpus nout into the global while we have the lock */ pp->pr_cache_nout += pc->pc_nout; pc->pc_nout = 0; pool_list_leave(pp); } static inline struct pool_cache * pool_cache_enter(struct pool *pp, int *s) { struct pool_cache *pc; pc = cpumem_enter(pp->pr_cache); *s = splraise(pp->pr_ipl); pc->pc_gen++; return (pc); } static inline void pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s) { pc->pc_gen++; splx(s); cpumem_leave(pp->pr_cache, pc); } void * pool_cache_get(struct pool *pp) { struct pool_cache *pc; struct pool_cache_item *ci; int s; pc = pool_cache_enter(pp, &s); if (pc->pc_actv != NULL) { ci = pc->pc_actv; } else if (pc->pc_prev != NULL) { ci = pc->pc_prev; pc->pc_prev = NULL; } else if ((ci = pool_cache_list_alloc(pp, pc)) == NULL) { pc->pc_nfail++; goto done; } pool_cache_item_magic_check(pp, ci); #ifdef DIAGNOSTIC if (pool_debug && POOL_CACHE_ITEM_POISONED(ci)) { size_t pidx; uint32_t pval; if (poison_check(ci + 1, pp->pr_size - sizeof(*ci), &pidx, &pval)) { int *ip = (int *)(ci + 1); ip += pidx; panic("%s: %s cpu free list modified: " "item addr %p+%zu 0x%x!=0x%x", __func__, pp->pr_wchan, ci, (caddr_t)ip - (caddr_t)ci, *ip, pval); } } #endif pc->pc_actv = ci->ci_next; pc->pc_nactv = POOL_CACHE_ITEM_NITEMS(ci) - 1; pc->pc_nget++; pc->pc_nout++; done: pool_cache_leave(pp, pc, s); return (ci); } void pool_cache_put(struct pool *pp, void *v) { struct pool_cache *pc; struct pool_cache_item *ci = v; unsigned long nitems; int s; #ifdef DIAGNOSTIC int poison = pool_debug && pp->pr_size > sizeof(*ci); if (poison) poison_mem(ci + 1, pp->pr_size - sizeof(*ci)); #endif pc = pool_cache_enter(pp, &s); nitems = pc->pc_nactv; if (nitems >= pp->pr_cache_items) { if (pc->pc_prev != NULL) pool_cache_list_free(pp, pc, pc->pc_prev); pc->pc_prev = pc->pc_actv; pc->pc_actv = NULL; pc->pc_nactv = 0; nitems = 0; } ci->ci_next = pc->pc_actv; ci->ci_nitems = ++nitems; #ifdef DIAGNOSTIC ci->ci_nitems |= poison ? 
POOL_CACHE_ITEM_NITEMS_POISON : 0; #endif pool_cache_item_magic(pp, ci); pc->pc_actv = ci; pc->pc_nactv = nitems; pc->pc_nput++; pc->pc_nout--; pool_cache_leave(pp, pc, s); } struct pool_cache_item * pool_cache_list_put(struct pool *pp, struct pool_cache_item *pl) { struct pool_cache_item *rpl, *next; if (pl == NULL) return (NULL); rpl = TAILQ_NEXT(pl, ci_nextl); pl_enter(pp, &pp->pr_lock); do { next = pl->ci_next; pool_do_put(pp, pl); pl = next; } while (pl != NULL); pl_leave(pp, &pp->pr_lock); return (rpl); } void pool_cache_destroy(struct pool *pp) { struct pool_cache *pc; struct pool_cache_item *pl; struct cpumem_iter i; struct cpumem *cm; rw_enter_write(&pool_lock); /* serialise with the gc */ cm = pp->pr_cache; pp->pr_cache = NULL; /* make pool_put avoid the cache */ rw_exit_write(&pool_lock); CPUMEM_FOREACH(pc, &i, cm) { pool_cache_list_put(pp, pc->pc_actv); pool_cache_list_put(pp, pc->pc_prev); } cpumem_put(&pool_caches, cm); pl = TAILQ_FIRST(&pp->pr_cache_lists); while (pl != NULL) pl = pool_cache_list_put(pp, pl); } void pool_cache_gc(struct pool *pp) { unsigned int contention, delta; if (getnsecuptime() - pp->pr_cache_timestamp > POOL_WAIT_GC && !TAILQ_EMPTY(&pp->pr_cache_lists) && pl_enter_try(pp, &pp->pr_cache_lock)) { struct pool_cache_item *pl = NULL; pl = TAILQ_FIRST(&pp->pr_cache_lists); if (pl != NULL) { TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl); pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl); pp->pr_cache_timestamp = getnsecuptime(); pp->pr_cache_ngc++; } pl_leave(pp, &pp->pr_cache_lock); pool_cache_list_put(pp, pl); } /* * if there's a lot of contention on the pr_cache_mtx then consider * growing the length of the list to reduce the need to access the * global pool. */ contention = pp->pr_cache_contention; delta = contention - pp->pr_cache_contention_prev; if (delta > 8 /* magic */) { if ((ncpusfound * 8 * 2) <= pp->pr_cache_nitems) pp->pr_cache_items += 8; } else if (delta == 0) { if (pp->pr_cache_items > 8) pp->pr_cache_items--; } pp->pr_cache_contention_prev = contention; } void pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi) { struct pool_cache *pc; struct cpumem_iter i; if (pp->pr_cache == NULL) return; /* loop through the caches twice to collect stats */ /* once without the lock so we can yield while reading nget/nput */ CPUMEM_FOREACH(pc, &i, pp->pr_cache) { uint64_t gen, nget, nput; do { while ((gen = pc->pc_gen) & 1) yield(); nget = pc->pc_nget; nput = pc->pc_nput; } while (gen != pc->pc_gen); pi->pr_nget += nget; pi->pr_nput += nput; } /* and once with the mtx so we can get consistent nout values */ pl_enter(pp, &pp->pr_cache_lock); CPUMEM_FOREACH(pc, &i, pp->pr_cache) pi->pr_nout += pc->pc_nout; pi->pr_nout += pp->pr_cache_nout; pl_leave(pp, &pp->pr_cache_lock); } int pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp) { struct kinfo_pool_cache kpc; if (pp->pr_cache == NULL) return (EOPNOTSUPP); memset(&kpc, 0, sizeof(kpc)); /* don't leak padding */ pl_enter(pp, &pp->pr_cache_lock); kpc.pr_ngc = pp->pr_cache_ngc; kpc.pr_len = pp->pr_cache_items; kpc.pr_nitems = pp->pr_cache_nitems; kpc.pr_contention = pp->pr_cache_contention; pl_leave(pp, &pp->pr_cache_lock); return (sysctl_rdstruct(oldp, oldlenp, NULL, &kpc, sizeof(kpc))); } int pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp) { struct pool_cache *pc; struct kinfo_pool_cache_cpu *kpcc, *info; unsigned int cpu = 0; struct cpumem_iter i; int error = 0; size_t len; if (pp->pr_cache == NULL) return (EOPNOTSUPP); if (*oldlenp % sizeof(*kpcc)) return (EINVAL); kpcc 
= mallocarray(ncpusfound, sizeof(*kpcc), M_TEMP, M_WAITOK|M_CANFAIL|M_ZERO); if (kpcc == NULL) return (EIO); len = ncpusfound * sizeof(*kpcc); CPUMEM_FOREACH(pc, &i, pp->pr_cache) { uint64_t gen; if (cpu >= ncpusfound) { error = EIO; goto err; } info = &kpcc[cpu]; info->pr_cpu = cpu; do { while ((gen = pc->pc_gen) & 1) yield(); info->pr_nget = pc->pc_nget; info->pr_nfail = pc->pc_nfail; info->pr_nput = pc->pc_nput; info->pr_nlget = pc->pc_nlget; info->pr_nlfail = pc->pc_nlfail; info->pr_nlput = pc->pc_nlput; } while (gen != pc->pc_gen); cpu++; } error = sysctl_rdstruct(oldp, oldlenp, NULL, kpcc, len); err: free(kpcc, M_TEMP, len); return (error); } #else /* MULTIPROCESSOR */ void pool_cache_init(struct pool *pp) { /* nop */ } void pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi) { /* nop */ } int pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp) { return (EOPNOTSUPP); } int pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp) { return (EOPNOTSUPP); } #endif /* MULTIPROCESSOR */ void pool_lock_mtx_init(struct pool *pp, union pool_lock *lock, const struct lock_type *type) { _mtx_init_flags(&lock->prl_mtx, pp->pr_ipl, pp->pr_wchan, 0, type); } void pool_lock_mtx_enter(union pool_lock *lock) { mtx_enter(&lock->prl_mtx); } int pool_lock_mtx_enter_try(union pool_lock *lock) { return (mtx_enter_try(&lock->prl_mtx)); } void pool_lock_mtx_leave(union pool_lock *lock) { mtx_leave(&lock->prl_mtx); } void pool_lock_mtx_assert_locked(union pool_lock *lock) { MUTEX_ASSERT_LOCKED(&lock->prl_mtx); } void pool_lock_mtx_assert_unlocked(union pool_lock *lock) { MUTEX_ASSERT_UNLOCKED(&lock->prl_mtx); } int pool_lock_mtx_sleep(void *ident, union pool_lock *lock, int priority, const char *wmesg) { return msleep_nsec(ident, &lock->prl_mtx, priority, wmesg, INFSLP); } static const struct pool_lock_ops pool_lock_ops_mtx = { pool_lock_mtx_init, pool_lock_mtx_enter, pool_lock_mtx_enter_try, pool_lock_mtx_leave, pool_lock_mtx_assert_locked, pool_lock_mtx_assert_unlocked, pool_lock_mtx_sleep, }; void pool_lock_rw_init(struct pool *pp, union pool_lock *lock, const struct lock_type *type) { _rw_init_flags(&lock->prl_rwlock, pp->pr_wchan, 0, type); } void pool_lock_rw_enter(union pool_lock *lock) { rw_enter_write(&lock->prl_rwlock); } int pool_lock_rw_enter_try(union pool_lock *lock) { return (rw_enter(&lock->prl_rwlock, RW_WRITE | RW_NOSLEEP) == 0); } void pool_lock_rw_leave(union pool_lock *lock) { rw_exit_write(&lock->prl_rwlock); } void pool_lock_rw_assert_locked(union pool_lock *lock) { rw_assert_wrlock(&lock->prl_rwlock); } void pool_lock_rw_assert_unlocked(union pool_lock *lock) { KASSERT(rw_status(&lock->prl_rwlock) != RW_WRITE); } int pool_lock_rw_sleep(void *ident, union pool_lock *lock, int priority, const char *wmesg) { return rwsleep_nsec(ident, &lock->prl_rwlock, priority, wmesg, INFSLP); } static const struct pool_lock_ops pool_lock_ops_rw = { pool_lock_rw_init, pool_lock_rw_enter, pool_lock_rw_enter_try, pool_lock_rw_leave, pool_lock_rw_assert_locked, pool_lock_rw_assert_unlocked, pool_lock_rw_sleep, };
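/*
 * Illustrative sketch, not part of the original file: how a kernel subsystem
 * typically consumes the pool allocator implemented above.  "struct foo",
 * "foo_pool", "foopl" and the foo_* functions are hypothetical names;
 * pool_init(), pool_get() and pool_put() are the real interfaces (see the
 * pool_cache_init() call above for the pool_init() argument order).
 */
#if 0	/* example only, never compiled */
struct foo {
	int	f_value;
};

struct pool foo_pool;

void
foo_attach(void)
{
	/* item size, alignment (0 = default), IPL, flags, wchan name, allocator */
	pool_init(&foo_pool, sizeof(struct foo), 0, IPL_NONE, PR_WAITOK,
	    "foopl", NULL);
}

struct foo *
foo_alloc(void)
{
	/* PR_WAITOK may sleep until an item is available; PR_ZERO zeroes it */
	return (pool_get(&foo_pool, PR_WAITOK | PR_ZERO));
}

void
foo_free(struct foo *f)
{
	pool_put(&foo_pool, f);
}
#endif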
/*	$OpenBSD: icmp6.h,v 1.51 2021/01/11 13:28:53 bluhm Exp $	*/
/*	$KAME: icmp6.h,v 1.84 2003/04/23 10:26:51 itojun Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET_ICMP6_H_ #define _NETINET_ICMP6_H_ #define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct icmp6_hdr) */ struct icmp6_hdr { u_int8_t icmp6_type; /* type field */ u_int8_t icmp6_code; /* code field */ u_int16_t icmp6_cksum; /* checksum field */ union { u_int32_t icmp6_un_data32[1]; /* type-specific field */ u_int16_t icmp6_un_data16[2]; /* type-specific field */ u_int8_t icmp6_un_data8[4]; /* type-specific field */ } icmp6_dataun; } __packed; #define icmp6_data32 icmp6_dataun.icmp6_un_data32 #define icmp6_data16 icmp6_dataun.icmp6_un_data16 #define icmp6_data8 icmp6_dataun.icmp6_un_data8 #define icmp6_pptr icmp6_data32[0] /* parameter prob */ #define icmp6_mtu icmp6_data32[0] /* packet too big */ #define icmp6_id icmp6_data16[0] /* echo request/reply */ #define icmp6_seq icmp6_data16[1] /* echo request/reply */ #define icmp6_maxdelay icmp6_data16[0] /* mcast group membership */ #define ICMP6_DST_UNREACH 1 /* dest unreachable, codes: */ #define ICMP6_PACKET_TOO_BIG 2 /* packet too big */ #define ICMP6_TIME_EXCEEDED 3 /* time exceeded, code: */ #define ICMP6_PARAM_PROB 4 /* ip6 header bad */ #define ICMP6_ECHO_REQUEST 128 /* echo service */ #define ICMP6_ECHO_REPLY 129 /* echo reply */ #define MLD_LISTENER_QUERY 130 /* multicast listener query */ #define MLD_LISTENER_REPORT 131 /* multicast listener report */ #define MLD_LISTENER_DONE 132 /* multicast listener done */ /* RFC2292 decls */ #define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */ #define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */ #define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */ #define ND_ROUTER_SOLICIT 133 /* router solicitation */ #define ND_ROUTER_ADVERT 134 /* router advertisement */ #define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */ #define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisement */ #define ND_REDIRECT 137 /* redirect */ #define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */ #define ICMP6_WRUREQUEST 139 /* who are you request */ #define ICMP6_WRUREPLY 140 /* who are you reply */ #define ICMP6_FQDN_QUERY 139 /* FQDN query */ #define ICMP6_FQDN_REPLY 140 /* FQDN reply */ #define ICMP6_NI_QUERY 139 /* node information request */ #define ICMP6_NI_REPLY 140 /* node information reply */ #define MLDV2_LISTENER_REPORT 143 /* RFC3810 listener report */ /* The definitions below are experimental. 
TBA */ #define MLD_MTRACE_RESP 200 /* mtrace response(to sender) */ #define MLD_MTRACE 201 /* mtrace messages */ #define ICMP6_MAXTYPE 201 #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ #define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ #define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ #define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ #define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */ #define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */ #define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */ #define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */ #define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */ #define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */ #define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */ #define ICMP6_NI_SUCCESS 0 /* node information successful reply */ #define ICMP6_NI_REFUSED 1 /* node information request is refused */ #define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */ #define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */ #define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */ #define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */ /* Used in kernel only */ #define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */ #define ND_REDIRECT_ROUTER 1 /* redirect to a better router */ /* * Multicast Listener Discovery */ struct mld_hdr { struct icmp6_hdr mld_icmp6_hdr; struct in6_addr mld_addr; /* multicast address */ } __packed; /* shortcut macro definitions */ #define mld_type mld_icmp6_hdr.icmp6_type #define mld_code mld_icmp6_hdr.icmp6_code #define mld_cksum mld_icmp6_hdr.icmp6_cksum #define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] #define mld_reserved mld_icmp6_hdr.icmp6_data16[1] /* * Neighbor Discovery */ struct nd_router_solicit { /* router solicitation */ struct icmp6_hdr nd_rs_hdr; /* could be followed by options */ } __packed; #define nd_rs_type nd_rs_hdr.icmp6_type #define nd_rs_code nd_rs_hdr.icmp6_code #define nd_rs_cksum nd_rs_hdr.icmp6_cksum #define nd_rs_reserved nd_rs_hdr.icmp6_data32[0] struct nd_router_advert { /* router advertisement */ struct icmp6_hdr nd_ra_hdr; u_int32_t nd_ra_reachable; /* reachable time */ u_int32_t nd_ra_retransmit; /* retransmit timer */ /* could be followed by options */ } __packed; #define nd_ra_type nd_ra_hdr.icmp6_type #define nd_ra_code nd_ra_hdr.icmp6_code #define nd_ra_cksum nd_ra_hdr.icmp6_cksum #define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0] #define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1] #define ND_RA_FLAG_MANAGED 0x80 #define ND_RA_FLAG_OTHER 0x40 #define ND_RA_FLAG_RTPREF_MASK 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_HIGH 0x08 /* 00001000 */ #define ND_RA_FLAG_RTPREF_MEDIUM 0x00 /* 00000000 */ #define ND_RA_FLAG_RTPREF_LOW 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_RSV 0x10 /* 00010000 */ #define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1] struct nd_neighbor_solicit { /* neighbor solicitation */ struct icmp6_hdr nd_ns_hdr; struct in6_addr nd_ns_target; /*target address */ /* could be followed by options */ } __packed; #define nd_ns_type nd_ns_hdr.icmp6_type #define nd_ns_code nd_ns_hdr.icmp6_code #define nd_ns_cksum nd_ns_hdr.icmp6_cksum #define nd_ns_reserved nd_ns_hdr.icmp6_data32[0] struct nd_neighbor_advert { /* neighbor advertisement */ struct 
icmp6_hdr nd_na_hdr; struct in6_addr nd_na_target; /* target address */ /* could be followed by options */ } __packed; #define nd_na_type nd_na_hdr.icmp6_type #define nd_na_code nd_na_hdr.icmp6_code #define nd_na_cksum nd_na_hdr.icmp6_cksum #define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0] #define ND_NA_FLAG_ROUTER htonl(0x80000000) #define ND_NA_FLAG_SOLICITED htonl(0x40000000) #define ND_NA_FLAG_OVERRIDE htonl(0x20000000) struct nd_redirect { /* redirect */ struct icmp6_hdr nd_rd_hdr; struct in6_addr nd_rd_target; /* target address */ struct in6_addr nd_rd_dst; /* destination address */ /* could be followed by options */ } __packed; #define nd_rd_type nd_rd_hdr.icmp6_type #define nd_rd_code nd_rd_hdr.icmp6_code #define nd_rd_cksum nd_rd_hdr.icmp6_cksum #define nd_rd_reserved nd_rd_hdr.icmp6_data32[0] struct nd_opt_hdr { /* Neighbor discovery option header */ u_int8_t nd_opt_type; u_int8_t nd_opt_len; /* followed by option specific data*/ } __packed; #define ND_OPT_SOURCE_LINKADDR 1 #define ND_OPT_TARGET_LINKADDR 2 #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 #define ND_OPT_ROUTE_INFO 24 #define ND_OPT_RDNSS 25 #define ND_OPT_DNSSL 31 struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; u_int8_t nd_opt_pi_len; u_int8_t nd_opt_pi_prefix_len; u_int8_t nd_opt_pi_flags_reserved; u_int32_t nd_opt_pi_valid_time; u_int32_t nd_opt_pi_preferred_time; u_int32_t nd_opt_pi_reserved2; struct in6_addr nd_opt_pi_prefix; } __packed; #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 struct nd_opt_rd_hdr { /* redirected header */ u_int8_t nd_opt_rh_type; u_int8_t nd_opt_rh_len; u_int16_t nd_opt_rh_reserved1; u_int32_t nd_opt_rh_reserved2; /* followed by IP header and data */ } __packed; struct nd_opt_mtu { /* MTU option */ u_int8_t nd_opt_mtu_type; u_int8_t nd_opt_mtu_len; u_int16_t nd_opt_mtu_reserved; u_int32_t nd_opt_mtu_mtu; } __packed; struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; u_int8_t nd_opt_rti_len; u_int8_t nd_opt_rti_prefixlen; u_int8_t nd_opt_rti_flags; u_int32_t nd_opt_rti_lifetime; } __packed; struct nd_opt_rdnss { /* RDNSS option */ u_int8_t nd_opt_rdnss_type; u_int8_t nd_opt_rdnss_len; u_int16_t nd_opt_rdnss_reserved; u_int32_t nd_opt_rdnss_lifetime; /* followed by list of recursive DNS servers */ } __packed; struct nd_opt_dnssl { /* DNSSL option */ u_int8_t nd_opt_dnssl_type; u_int8_t nd_opt_dnssl_len; u_int16_t nd_opt_dnssl_reserved; u_int32_t nd_opt_dnssl_lifetime; /* followed by list of DNS search domains */ } __packed; /* * icmp6 namelookup */ struct icmp6_namelookup { struct icmp6_hdr icmp6_nl_hdr; u_int8_t icmp6_nl_nonce[8]; int32_t icmp6_nl_ttl; #if 0 u_int8_t icmp6_nl_len; u_int8_t icmp6_nl_name[3]; #endif /* could be followed by options */ } __packed; /* * icmp6 node information */ struct icmp6_nodeinfo { struct icmp6_hdr icmp6_ni_hdr; u_int8_t icmp6_ni_nonce[8]; /* could be followed by reply data */ } __packed; #define ni_type icmp6_ni_hdr.icmp6_type #define ni_code icmp6_ni_hdr.icmp6_code #define ni_cksum icmp6_ni_hdr.icmp6_cksum #define ni_qtype icmp6_ni_hdr.icmp6_data16[0] #define ni_flags icmp6_ni_hdr.icmp6_data16[1] #define NI_QTYPE_NOOP 0 /* NOOP */ #define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */ #define NI_QTYPE_FQDN 2 /* FQDN (draft 04) */ #define NI_QTYPE_DNSNAME 2 /* DNS Name */ #define NI_QTYPE_NODEADDR 3 /* Node Addresses */ #define NI_QTYPE_IPV4ADDR 4 /* IPv4 Addresses */ #define NI_SUPTYPE_FLAG_COMPRESS htons(0x0001) #define 
NI_FQDN_FLAG_VALIDTTL htons(0x0001) #define NI_NODEADDR_FLAG_TRUNCATE htons(0x0001) #define NI_NODEADDR_FLAG_ALL htons(0x0002) #define NI_NODEADDR_FLAG_COMPAT htons(0x0004) #define NI_NODEADDR_FLAG_LINKLOCAL htons(0x0008) #define NI_NODEADDR_FLAG_SITELOCAL htons(0x0010) #define NI_NODEADDR_FLAG_GLOBAL htons(0x0020) #define NI_NODEADDR_FLAG_ANYCAST htons(0x0040) /* not in spec */ struct ni_reply_fqdn { u_int32_t ni_fqdn_ttl; /* TTL */ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ } __packed; /* * Router Renumbering. as router-renum-08.txt */ struct icmp6_router_renum { /* router renumbering header */ struct icmp6_hdr rr_hdr; u_int8_t rr_segnum; u_int8_t rr_flags; u_int16_t rr_maxdelay; u_int32_t rr_reserved; } __packed; #define ICMP6_RR_FLAGS_TEST 0x80 #define ICMP6_RR_FLAGS_REQRESULT 0x40 #define ICMP6_RR_FLAGS_FORCEAPPLY 0x20 #define ICMP6_RR_FLAGS_SPECSITE 0x10 #define ICMP6_RR_FLAGS_PREVDONE 0x08 #define rr_type rr_hdr.icmp6_type #define rr_code rr_hdr.icmp6_code #define rr_cksum rr_hdr.icmp6_cksum #define rr_seqnum rr_hdr.icmp6_data32[0] struct rr_pco_match { /* match prefix part */ u_int8_t rpm_code; u_int8_t rpm_len; u_int8_t rpm_ordinal; u_int8_t rpm_matchlen; u_int8_t rpm_minlen; u_int8_t rpm_maxlen; u_int16_t rpm_reserved; struct in6_addr rpm_prefix; } __packed; #define RPM_PCO_ADD 1 #define RPM_PCO_CHANGE 2 #define RPM_PCO_SETGLOBAL 3 #define RPM_PCO_MAX 4 struct rr_pco_use { /* use prefix part */ u_int8_t rpu_uselen; u_int8_t rpu_keeplen; u_int8_t rpu_ramask; u_int8_t rpu_raflags; u_int32_t rpu_vltime; u_int32_t rpu_pltime; u_int32_t rpu_flags; struct in6_addr rpu_prefix; } __packed; #define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 #define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 #define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME htonl(0x80000000) #define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME htonl(0x40000000) struct rr_result { /* router renumbering result message */ u_int16_t rrr_flags; u_int8_t rrr_ordinal; u_int8_t rrr_matchedlen; u_int32_t rrr_ifid; struct in6_addr rrr_prefix; } __packed; #define ICMP6_RR_RESULT_FLAGS_OOB htons(0x0002) #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN htons(0x0001) /* * icmp6 filter structures. */ struct icmp6_filter { u_int32_t icmp6_filt[8]; }; #define ICMP6_FILTER_SETPASSALL(filterp) \ memset(filterp, 0xff, sizeof(struct icmp6_filter)) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ memset(filterp, 0x00, sizeof(struct icmp6_filter)) #define ICMP6_FILTER_SETPASS(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31))) #define ICMP6_FILTER_SETBLOCK(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31))) #define ICMP6_FILTER_WILLPASS(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0) #define ICMP6_FILTER_WILLBLOCK(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) /* * Variables related to this implementation * of the internet control message protocol version 6. 
*/ struct icmp6stat { /* statistics related to icmp6 packets generated */ u_int64_t icp6s_error; /* # of calls to icmp6_error */ u_int64_t icp6s_canterror; /* no error because old was icmp */ u_int64_t icp6s_toofreq; /* no error because rate limitation */ u_int64_t icp6s_outhist[256]; /* statistics related to input message processed */ u_int64_t icp6s_badcode; /* icmp6_code out of range */ u_int64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ u_int64_t icp6s_checksum; /* bad checksum */ u_int64_t icp6s_badlen; /* calculated bound mismatch */ /* * number of responses: this member is inherited from netinet code, but * for netinet6 code, it is already available in icp6s_outhist[]. */ u_int64_t icp6s_reflect; u_int64_t icp6s_inhist[256]; u_int64_t icp6s_nd_toomanyopt; /* too many ND options */ u_int64_t icp6s_odst_unreach_noroute; u_int64_t icp6s_odst_unreach_admin; u_int64_t icp6s_odst_unreach_beyondscope; u_int64_t icp6s_odst_unreach_addr; u_int64_t icp6s_odst_unreach_noport; u_int64_t icp6s_opacket_too_big; u_int64_t icp6s_otime_exceed_transit; u_int64_t icp6s_otime_exceed_reassembly; u_int64_t icp6s_oparamprob_header; u_int64_t icp6s_oparamprob_nextheader; u_int64_t icp6s_oparamprob_option; u_int64_t icp6s_oredirect; /* we regard redirect as an error here */ u_int64_t icp6s_ounknown; u_int64_t icp6s_pmtuchg; /* path MTU changes */ u_int64_t icp6s_nd_badopt; /* bad ND options */ u_int64_t icp6s_badns; /* bad neighbor solicitation */ u_int64_t icp6s_badna; /* bad neighbor advertisement */ u_int64_t icp6s_badrs; /* bad router advertisement */ u_int64_t icp6s_badra; /* bad router advertisement */ u_int64_t icp6s_badredirect; /* bad redirect message */ }; /* * Names for ICMP sysctl objects */ #define ICMPV6CTL_STATS 1 #define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ #define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ #define ICMPV6CTL_ND6_DELAY 8 #define ICMPV6CTL_ND6_UMAXTRIES 9 #define ICMPV6CTL_ND6_MMAXTRIES 10 #define ICMPV6CTL_NODEINFO 13 #define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ #define ICMPV6CTL_ND6_MAXNUDHINT 15 #define ICMPV6CTL_MTUDISC_HIWAT 16 #define ICMPV6CTL_MTUDISC_LOWAT 17 #define ICMPV6CTL_ND6_DEBUG 18 #define ICMPV6CTL_MAXID 19 #define ICMPV6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "redirtimeout", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "nd6_delay", CTLTYPE_INT }, \ { "nd6_umaxtries", CTLTYPE_INT }, \ { "nd6_mmaxtries", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "errppslimit", CTLTYPE_INT }, \ { "nd6_maxnudhint", CTLTYPE_INT }, \ { "mtudisc_hiwat", CTLTYPE_INT }, \ { "mtudisc_lowat", CTLTYPE_INT }, \ { "nd6_debug", CTLTYPE_INT }, \ } #define RTF_PROBEMTU RTF_PROTO1 #ifdef _KERNEL #include <sys/percpu.h> enum icmp6stat_counters { icp6s_error, icp6s_canterror, icp6s_toofreq, icp6s_outhist, icp6s_badcode = icp6s_outhist + 256, icp6s_tooshort, icp6s_checksum, icp6s_badlen, icp6s_reflect, icp6s_inhist, icp6s_nd_toomanyopt = icp6s_inhist + 256, icp6s_odst_unreach_noroute, icp6s_odst_unreach_admin, icp6s_odst_unreach_beyondscope, icp6s_odst_unreach_addr, icp6s_odst_unreach_noport, icp6s_opacket_too_big, icp6s_otime_exceed_transit, icp6s_otime_exceed_reassembly, icp6s_oparamprob_header, icp6s_oparamprob_nextheader, icp6s_oparamprob_option, icp6s_oredirect, icp6s_ounknown, icp6s_pmtuchg, icp6s_nd_badopt, icp6s_badns, icp6s_badna, icp6s_badrs, icp6s_badra, icp6s_badredirect, icp6s_ncounters, }; extern struct cpumem *icmp6counters; static inline void icmp6stat_inc(enum 
icmp6stat_counters c) { counters_inc(icmp6counters, c); } struct rtentry; struct rttimer; struct in6_multi; void icmp6_init(void); void icmp6_paramerror(struct mbuf *, int); struct mbuf *icmp6_do_error(struct mbuf *, int, int, int); void icmp6_error(struct mbuf *, int, int, int); int icmp6_input(struct mbuf **, int *, int, int); void icmp6_fasttimo(void); int icmp6_reflect(struct mbuf **, size_t, struct sockaddr *); void icmp6_prepare(struct mbuf *); void icmp6_redirect_input(struct mbuf *, int); void icmp6_redirect_output(struct mbuf *, struct rtentry *); int icmp6_sysctl(int *, u_int, void *, size_t *, void *, size_t); struct rtentry *icmp6_mtudisc_clone(struct sockaddr_in6 *, u_int, int); struct ip6ctlparam; void icmp6_mtudisc_update(struct ip6ctlparam *, int); void icmp6_mtudisc_callback_register(void (*)(struct sockaddr_in6 *, u_int)); extern int icmp6_redirtimeout; /* cache time for redirect routes */ #endif /* _KERNEL */ #endif /* _NETINET_ICMP6_H_ */
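/*
 * Illustrative sketch, not part of the original header: typical userland use
 * of the icmp6_filter macros defined above on a raw ICMPv6 socket.  The
 * function and variable names are hypothetical; ICMP6_FILTER is the RFC 3542
 * socket option for installing the filter and is assumed to be provided by
 * this implementation.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

int
open_echo_reply_socket(void)
{
	struct icmp6_filter filt;
	int s;

	s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	if (s == -1)
		return (-1);

	/* block every ICMPv6 type, then pass only echo replies */
	ICMP6_FILTER_SETBLOCKALL(&filt);
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);

	if (setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER,
	    &filt, sizeof(filt)) == -1)
		return (-1);

	return (s);
}
#endif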
/* $OpenBSD: chacha_private.h,v 1.4 2020/07/22 13:54:30 tobhe Exp $ */
/*
chacha-merged.c version 20080118
D. J. Bernstein
Public domain.
*/

#include <sys/systm.h>

typedef unsigned char u8;
typedef unsigned int u32;

typedef struct {
	u32 input[16]; /* could be compressed */
} chacha_ctx;

#define U8C(v) (v##U)
#define U32C(v) (v##U)

#define U8V(v) ((u8)(v) & U8C(0xFF))
#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))

#define ROTL32(v, n) \
  (U32V((v) << (n)) | ((v) >> (32 - (n))))

#define U8TO32_LITTLE(p) \
  (((u32)((p)[0])      ) | \
   ((u32)((p)[1]) <<  8) | \
   ((u32)((p)[2]) << 16) | \
   ((u32)((p)[3]) << 24))

#define U32TO8_LITTLE(p, v) \
  do { \
    (p)[0] = U8V((v)      ); \
    (p)[1] = U8V((v) >>  8); \
    (p)[2] = U8V((v) >> 16); \
    (p)[3] = U8V((v) >> 24); \
  } while (0)

#define ROTATE(v,c) (ROTL32(v,c))
#define XOR(v,w) ((v) ^ (w))
#define PLUS(v,w) (U32V((v) + (w)))
#define PLUSONE(v) (PLUS((v),1))

#define QUARTERROUND(a,b,c,d) \
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

static const char sigma[16] = "expand 32-byte k";
static const char tau[16] = "expand 16-byte k";

static inline void
hchacha20(u32 derived_key[8], const u8 nonce[16], const u8 key[32])
{
	int i;
	uint32_t x[] = {
		U8TO32_LITTLE(sigma + 0),
		U8TO32_LITTLE(sigma + 4),
		U8TO32_LITTLE(sigma + 8),
		U8TO32_LITTLE(sigma + 12),
		U8TO32_LITTLE(key + 0),
		U8TO32_LITTLE(key + 4),
		U8TO32_LITTLE(key + 8),
		U8TO32_LITTLE(key + 12),
		U8TO32_LITTLE(key + 16),
		U8TO32_LITTLE(key + 20),
		U8TO32_LITTLE(key + 24),
		U8TO32_LITTLE(key + 28),
		U8TO32_LITTLE(nonce + 0),
		U8TO32_LITTLE(nonce + 4),
		U8TO32_LITTLE(nonce + 8),
		U8TO32_LITTLE(nonce + 12)
	};

	for (i = 20; i > 0; i -= 2) {
		QUARTERROUND(x[0], x[4], x[8], x[12])
		QUARTERROUND(x[1], x[5], x[9], x[13])
		QUARTERROUND(x[2], x[6], x[10], x[14])
		QUARTERROUND(x[3], x[7], x[11], x[15])
		QUARTERROUND(x[0], x[5], x[10], x[15])
		QUARTERROUND(x[1], x[6], x[11], x[12])
		QUARTERROUND(x[2], x[7], x[8], x[13])
		QUARTERROUND(x[3], x[4], x[9], x[14])
	}

	memcpy(derived_key + 0, x + 0, sizeof(u32) * 4);
	memcpy(derived_key + 4, x + 12, sizeof(u32) * 4);
}

static void
chacha_keysetup(chacha_ctx *x, const u8 *k, u32 kbits)
{
	const char *constants;

	x->input[4] = U8TO32_LITTLE(k + 0);
	x->input[5] = U8TO32_LITTLE(k + 4);
	x->input[6] = U8TO32_LITTLE(k + 8);
	x->input[7] = U8TO32_LITTLE(k + 12);
	if (kbits == 256) { /* recommended */
		k += 16;
		constants = sigma;
	} else { /* kbits == 128 */
		constants = tau;
	}
	x->input[8] = U8TO32_LITTLE(k + 0);
	x->input[9] = U8TO32_LITTLE(k + 4);
	x->input[10] = U8TO32_LITTLE(k + 8);
x->input[11] = U8TO32_LITTLE(k + 12); x->input[0] = U8TO32_LITTLE(constants + 0); x->input[1] = U8TO32_LITTLE(constants + 4); x->input[2] = U8TO32_LITTLE(constants + 8); x->input[3] = U8TO32_LITTLE(constants + 12); } static void chacha_ivsetup(chacha_ctx *x, const u8 *iv, const u8 *counter) { x->input[12] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 0); x->input[13] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 4); x->input[14] = U8TO32_LITTLE(iv + 0); x->input[15] = U8TO32_LITTLE(iv + 4); } static void chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) { u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; u8 *ctarget = NULL; u8 tmp[64]; u_int i; if (!bytes) return; j0 = x->input[0]; j1 = x->input[1]; j2 = x->input[2]; j3 = x->input[3]; j4 = x->input[4]; j5 = x->input[5]; j6 = x->input[6]; j7 = x->input[7]; j8 = x->input[8]; j9 = x->input[9]; j10 = x->input[10]; j11 = x->input[11]; j12 = x->input[12]; j13 = x->input[13]; j14 = x->input[14]; j15 = x->input[15]; for (;;) { if (bytes < 64) { for (i = 0;i < bytes;++i) tmp[i] = m[i]; m = tmp; ctarget = c; c = tmp; } x0 = j0; x1 = j1; x2 = j2; x3 = j3; x4 = j4; x5 = j5; x6 = j6; x7 = j7; x8 = j8; x9 = j9; x10 = j10; x11 = j11; x12 = j12; x13 = j13; x14 = j14; x15 = j15; for (i = 20;i > 0;i -= 2) { QUARTERROUND( x0, x4, x8,x12) QUARTERROUND( x1, x5, x9,x13) QUARTERROUND( x2, x6,x10,x14) QUARTERROUND( x3, x7,x11,x15) QUARTERROUND( x0, x5,x10,x15) QUARTERROUND( x1, x6,x11,x12) QUARTERROUND( x2, x7, x8,x13) QUARTERROUND( x3, x4, x9,x14) } x0 = PLUS(x0,j0); x1 = PLUS(x1,j1); x2 = PLUS(x2,j2); x3 = PLUS(x3,j3); x4 = PLUS(x4,j4); x5 = PLUS(x5,j5); x6 = PLUS(x6,j6); x7 = PLUS(x7,j7); x8 = PLUS(x8,j8); x9 = PLUS(x9,j9); x10 = PLUS(x10,j10); x11 = PLUS(x11,j11); x12 = PLUS(x12,j12); x13 = PLUS(x13,j13); x14 = PLUS(x14,j14); x15 = PLUS(x15,j15); #ifndef KEYSTREAM_ONLY x0 = XOR(x0,U8TO32_LITTLE(m + 0)); x1 = XOR(x1,U8TO32_LITTLE(m + 4)); x2 = XOR(x2,U8TO32_LITTLE(m + 8)); x3 = XOR(x3,U8TO32_LITTLE(m + 12)); x4 = XOR(x4,U8TO32_LITTLE(m + 16)); x5 = XOR(x5,U8TO32_LITTLE(m + 20)); x6 = XOR(x6,U8TO32_LITTLE(m + 24)); x7 = XOR(x7,U8TO32_LITTLE(m + 28)); x8 = XOR(x8,U8TO32_LITTLE(m + 32)); x9 = XOR(x9,U8TO32_LITTLE(m + 36)); x10 = XOR(x10,U8TO32_LITTLE(m + 40)); x11 = XOR(x11,U8TO32_LITTLE(m + 44)); x12 = XOR(x12,U8TO32_LITTLE(m + 48)); x13 = XOR(x13,U8TO32_LITTLE(m + 52)); x14 = XOR(x14,U8TO32_LITTLE(m + 56)); x15 = XOR(x15,U8TO32_LITTLE(m + 60)); #endif j12 = PLUSONE(j12); if (!j12) { j13 = PLUSONE(j13); /* stopping at 2^70 bytes per nonce is user's responsibility */ } U32TO8_LITTLE(c + 0,x0); U32TO8_LITTLE(c + 4,x1); U32TO8_LITTLE(c + 8,x2); U32TO8_LITTLE(c + 12,x3); U32TO8_LITTLE(c + 16,x4); U32TO8_LITTLE(c + 20,x5); U32TO8_LITTLE(c + 24,x6); U32TO8_LITTLE(c + 28,x7); U32TO8_LITTLE(c + 32,x8); U32TO8_LITTLE(c + 36,x9); U32TO8_LITTLE(c + 40,x10); U32TO8_LITTLE(c + 44,x11); U32TO8_LITTLE(c + 48,x12); U32TO8_LITTLE(c + 52,x13); U32TO8_LITTLE(c + 56,x14); U32TO8_LITTLE(c + 60,x15); if (bytes <= 64) { if (bytes < 64) { for (i = 0;i < bytes;++i) ctarget[i] = c[i]; } x->input[12] = j12; x->input[13] = j13; return; } bytes -= 64; c += 64; #ifndef KEYSTREAM_ONLY m += 64; #endif } }
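/*
 * Illustrative sketch, not part of the original file: driving the routines
 * above to encrypt a buffer.  The buffer and length names are hypothetical;
 * chacha_keysetup(), chacha_ivsetup() and chacha_encrypt_bytes() are the
 * functions defined in this file.
 */
#if 0	/* example only */
static void
chacha_example(const u8 key[32], const u8 iv[8],
    const u8 *plain, u8 *cipher, u32 len)
{
	chacha_ctx ctx;

	chacha_keysetup(&ctx, key, 256);	/* 256-bit key selects sigma */
	chacha_ivsetup(&ctx, iv, NULL);		/* NULL counter starts at block 0 */
	chacha_encrypt_bytes(&ctx, plain, cipher, len);
}
#endif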
/* $OpenBSD: in_pcb.c,v 1.275 2022/09/03 22:43:38 mvs Exp $ */
/* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */

/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include "pf.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/mount.h> #include <sys/pool.h> #include <sys/proc.h> #include <net/if.h> #include <net/if_var.h> #include <net/pfvar.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_pcb.h> #ifdef IPSEC #include <netinet/ip_esp.h> #endif /* IPSEC */ #include "stoeplitz.h" #if NSTOEPLITZ > 0 #include <net/toeplitz.h> #endif const struct in_addr zeroin_addr; union { struct in_addr za_in; struct in6_addr za_in6; } zeroin46_addr; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. 
*/ int ipport_firstauto = IPPORT_RESERVED; int ipport_lastauto = IPPORT_USERRESERVED; int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; int ipport_hilastauto = IPPORT_HILASTAUTO; struct baddynamicports baddynamicports; struct baddynamicports rootonlyports; struct pool inpcb_pool; void in_pcbhash_insert(struct inpcb *); struct inpcb *in_pcbhash_lookup(struct inpcbtable *, u_int, const struct in_addr *, u_short, const struct in_addr *, u_short); int in_pcbresize(struct inpcbtable *, int); #define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4) struct inpcbhead *in_pcbhash(struct inpcbtable *, u_int, const struct in_addr *, u_short, const struct in_addr *, u_short); struct inpcbhead *in_pcblhash(struct inpcbtable *, u_int, u_short); /* * in_pcb is used for inet and inet6. in6_pcb only contains special * IPv6 cases. So the internet initializer is used for both domains. */ void in_init(void) { pool_init(&inpcb_pool, sizeof(struct inpcb), 0, IPL_SOFTNET, 0, "inpcb", NULL); } struct inpcbhead * in_pcbhash(struct inpcbtable *table, u_int rdomain, const struct in_addr *faddr, u_short fport, const struct in_addr *laddr, u_short lport) { SIPHASH_CTX ctx; u_int32_t nrdom = htonl(rdomain); SipHash24_Init(&ctx, &table->inpt_key); SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); SipHash24_Update(&ctx, faddr, sizeof(*faddr)); SipHash24_Update(&ctx, &fport, sizeof(fport)); SipHash24_Update(&ctx, laddr, sizeof(*laddr)); SipHash24_Update(&ctx, &lport, sizeof(lport)); return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]); } struct inpcbhead * in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport) { SIPHASH_CTX ctx; u_int32_t nrdom = htonl(rdomain); SipHash24_Init(&ctx, &table->inpt_lkey); SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); SipHash24_Update(&ctx, &lport, sizeof(lport)); return (&table->inpt_lhashtbl[SipHash24_End(&ctx) & table->inpt_lmask]); } void in_pcbinit(struct inpcbtable *table, int hashsize) { mtx_init(&table->inpt_mtx, IPL_SOFTNET); rw_init(&table->inpt_notify, "inpnotify"); TAILQ_INIT(&table->inpt_queue); table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, &table->inpt_mask); table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_WAITOK, &table->inpt_lmask); table->inpt_count = 0; table->inpt_size = hashsize; arc4random_buf(&table->inpt_key, sizeof(table->inpt_key)); arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); } /* * Check if the specified port is invalid for dynamic allocation. 
*/ int in_baddynamic(u_int16_t port, u_int16_t proto) { switch (proto) { case IPPROTO_TCP: return (DP_ISSET(baddynamicports.tcp, port)); case IPPROTO_UDP: #ifdef IPSEC /* Cannot preset this as it is a sysctl */ if (port == udpencap_port) return (1); #endif return (DP_ISSET(baddynamicports.udp, port)); default: return (0); } } int in_rootonly(u_int16_t port, u_int16_t proto) { switch (proto) { case IPPROTO_TCP: return (port < IPPORT_RESERVED || DP_ISSET(rootonlyports.tcp, port)); case IPPROTO_UDP: return (port < IPPORT_RESERVED || DP_ISSET(rootonlyports.udp, port)); default: return (0); } } int in_pcballoc(struct socket *so, struct inpcbtable *table) { struct inpcb *inp; inp = pool_get(&inpcb_pool, PR_NOWAIT|PR_ZERO); if (inp == NULL) return (ENOBUFS); inp->inp_table = table; inp->inp_socket = so; refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB); mtx_init(&inp->inp_mtx, IPL_SOFTNET); inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT; inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT; inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT; inp->inp_seclevel[SL_IPCOMP] = IPSEC_IPCOMP_LEVEL_DEFAULT; inp->inp_rtableid = curproc->p_p->ps_rtableid; inp->inp_hops = -1; #ifdef INET6 /* * Small change in this function to set the INP_IPV6 flag so routines * outside pcb-specific routines don't need to use sotopf(), and all * of its pointer chasing, later. */ if (sotopf(so) == PF_INET6) inp->inp_flags = INP_IPV6; inp->inp_cksum6 = -1; #endif /* INET6 */ mtx_enter(&table->inpt_mtx); if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size)) (void)in_pcbresize(table, table->inpt_size * 2); TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue); in_pcbhash_insert(inp); mtx_leave(&table->inpt_mtx); so->so_pcb = inp; return (0); } int in_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p) { struct socket *so = inp->inp_socket; u_int16_t lport = 0; int wild = 0; void *laddr = &zeroin46_addr; int error; if (inp->inp_lport) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild = INPLOOKUP_WILDCARD; switch (sotopf(so)) { #ifdef INET6 case PF_INET6: if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) return (EINVAL); wild |= INPLOOKUP_IPV6; if (nam) { struct sockaddr_in6 *sin6; if ((error = in6_nam2sin6(nam, &sin6))) return (error); if ((error = in6_pcbaddrisavail(inp, sin6, wild, p))) return (error); laddr = &sin6->sin6_addr; lport = sin6->sin6_port; } break; #endif case PF_INET: if (inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); if (nam) { struct sockaddr_in *sin; if ((error = in_nam2sin(nam, &sin))) return (error); if ((error = in_pcbaddrisavail(inp, sin, wild, p))) return (error); laddr = &sin->sin_addr; lport = sin->sin_port; } break; default: return (EINVAL); } if (lport == 0) { if ((error = in_pcbpickport(&lport, laddr, wild, inp, p))) return (error); } else { if (in_rootonly(ntohs(lport), so->so_proto->pr_protocol) && suser(p) != 0) return (EACCES); } if (nam) { switch (sotopf(so)) { #ifdef INET6 case PF_INET6: inp->inp_laddr6 = *(struct in6_addr *)laddr; break; #endif case PF_INET: inp->inp_laddr = *(struct in_addr *)laddr; break; } } inp->inp_lport = lport; in_pcbrehash(inp); return (0); } int in_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in *sin, int wild, struct proc *p) { struct socket *so = inp->inp_socket; struct inpcbtable *table = inp->inp_table; u_int16_t lport = sin->sin_port; int reuseport = (so->so_options & 
SO_REUSEPORT); if (IN_MULTICAST(sin->sin_addr.s_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { /* * we must check that we are binding to an address we * own except when: * - SO_BINDANY is set or * - we are binding a UDP socket to 255.255.255.255 or * - we are binding a UDP socket to one of our broadcast * addresses */ if (!ISSET(so->so_options, SO_BINDANY) && !(so->so_type == SOCK_DGRAM && sin->sin_addr.s_addr == INADDR_BROADCAST) && !(so->so_type == SOCK_DGRAM && in_broadcast(sin->sin_addr, inp->inp_rtableid))) { struct ifaddr *ia; sin->sin_port = 0; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ia = ifa_ifwithaddr(sintosa(sin), inp->inp_rtableid); sin->sin_port = lport; if (ia == NULL) return (EADDRNOTAVAIL); } } if (lport) { struct inpcb *t; int error = 0; if (so->so_euid && !IN_MULTICAST(sin->sin_addr.s_addr)) { t = in_pcblookup_local(table, &sin->sin_addr, lport, INPLOOKUP_WILDCARD, inp->inp_rtableid); if (t && (so->so_euid != t->inp_socket->so_euid)) error = EADDRINUSE; in_pcbunref(t); if (error) return (error); } t = in_pcblookup_local(table, &sin->sin_addr, lport, wild, inp->inp_rtableid); if (t && (reuseport & t->inp_socket->so_options) == 0) error = EADDRINUSE; in_pcbunref(t); if (error) return (error); } return (0); } int in_pcbpickport(u_int16_t *lport, void *laddr, int wild, struct inpcb *inp, struct proc *p) { struct socket *so = inp->inp_socket; struct inpcbtable *table = inp->inp_table; struct inpcb *t; u_int16_t first, last, lower, higher, candidate, localport; int count; if (inp->inp_flags & INP_HIGHPORT) { first = ipport_hifirstauto; /* sysctl */ last = ipport_hilastauto; } else if (inp->inp_flags & INP_LOWPORT) { if (suser(p)) return (EACCES); first = IPPORT_RESERVED-1; /* 1023 */ last = 600; /* not IPPORT_RESERVED/2 */ } else { first = ipport_firstauto; /* sysctl */ last = ipport_lastauto; } if (first < last) { lower = first; higher = last; } else { lower = last; higher = first; } /* * Simple check to ensure all ports are not used up causing * a deadlock here. */ count = higher - lower; candidate = lower + arc4random_uniform(count); t = NULL; do { in_pcbunref(t); do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++candidate; if (candidate < lower || candidate > higher) candidate = lower; localport = htons(candidate); } while (in_baddynamic(candidate, so->so_proto->pr_protocol)); t = in_pcblookup_local(table, laddr, localport, wild, inp->inp_rtableid); } while (t != NULL); *lport = localport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. 
*/ int in_pcbconnect(struct inpcb *inp, struct mbuf *nam) { struct in_addr ina; struct sockaddr_in *sin; struct inpcb *t; int error; #ifdef INET6 if (sotopf(inp->inp_socket) == PF_INET6) return (in6_pcbconnect(inp, nam)); KASSERT((inp->inp_flags & INP_IPV6) == 0); #endif /* INET6 */ if ((error = in_nam2sin(nam, &sin))) return (error); if (sin->sin_port == 0) return (EADDRNOTAVAIL); error = in_pcbselsrc(&ina, sin, inp); if (error) return (error); t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port, ina, inp->inp_lport, inp->inp_rtableid); if (t != NULL) { in_pcbunref(t); return (EADDRINUSE); } KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport); if (inp->inp_laddr.s_addr == INADDR_ANY) { if (inp->inp_lport == 0) { error = in_pcbbind(inp, NULL, curproc); if (error) return (error); t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port, ina, inp->inp_lport, inp->inp_rtableid); if (t != NULL) { inp->inp_lport = 0; in_pcbunref(t); return (EADDRINUSE); } } inp->inp_laddr = ina; } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); #if NSTOEPLITZ > 0 inp->inp_flowid = stoeplitz_ip4port(inp->inp_faddr.s_addr, inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport); #endif return (0); } void in_pcbdisconnect(struct inpcb *inp) { #if NPF > 0 if (inp->inp_pf_sk) { pf_remove_divert_state(inp->inp_pf_sk); /* pf_remove_divert_state() may have detached the state */ pf_inp_unlink(inp); } #endif switch (sotopf(inp->inp_socket)) { #ifdef INET6 case PF_INET6: inp->inp_faddr6 = in6addr_any; break; #endif case PF_INET: inp->inp_faddr.s_addr = INADDR_ANY; break; } inp->inp_fport = 0; inp->inp_flowid = 0; in_pcbrehash(inp); if (inp->inp_socket->so_state & SS_NOFDREF) in_pcbdetach(inp); } void in_pcbdetach(struct inpcb *inp) { struct socket *so = inp->inp_socket; struct inpcbtable *table = inp->inp_table; so->so_pcb = NULL; /* * As long as the NET_LOCK() is the default lock for Internet * sockets, do not release it to not introduce new sleeping * points. 
*/ sofree(so, 1); m_freem(inp->inp_options); if (inp->inp_route.ro_rt) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } #ifdef INET6 if (inp->inp_flags & INP_IPV6) { ip6_freepcbopts(inp->inp_outputopts6); ip6_freemoptions(inp->inp_moptions6); } else #endif ip_freemoptions(inp->inp_moptions); #if NPF > 0 if (inp->inp_pf_sk) { pf_remove_divert_state(inp->inp_pf_sk); /* pf_remove_divert_state() may have detached the state */ pf_inp_unlink(inp); } #endif mtx_enter(&table->inpt_mtx); LIST_REMOVE(inp, inp_lhash); LIST_REMOVE(inp, inp_hash); TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue); table->inpt_count--; mtx_leave(&table->inpt_mtx); in_pcbunref(inp); } struct inpcb * in_pcbref(struct inpcb *inp) { if (inp == NULL) return NULL; refcnt_take(&inp->inp_refcnt); return inp; } void in_pcbunref(struct inpcb *inp) { if (inp == NULL) return; if (refcnt_rele(&inp->inp_refcnt) == 0) return; KASSERT((LIST_NEXT(inp, inp_hash) == NULL) || (LIST_NEXT(inp, inp_hash) == _Q_INVALID)); KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) || (LIST_NEXT(inp, inp_lhash) == _Q_INVALID)); KASSERT((TAILQ_NEXT(inp, inp_queue) == NULL) || (TAILQ_NEXT(inp, inp_queue) == _Q_INVALID)); pool_put(&inpcb_pool, inp); } void in_setsockaddr(struct inpcb *inp, struct mbuf *nam) { struct sockaddr_in *sin; nam->m_len = sizeof(*sin); sin = mtod(nam, struct sockaddr_in *); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = inp->inp_lport; sin->sin_addr = inp->inp_laddr; } void in_setpeeraddr(struct inpcb *inp, struct mbuf *nam) { struct sockaddr_in *sin; #ifdef INET6 if (sotopf(inp->inp_socket) == PF_INET6) { in6_setpeeraddr(inp, nam); return; } #endif /* INET6 */ nam->m_len = sizeof(*sin); sin = mtod(nam, struct sockaddr_in *); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = inp->inp_fport; sin->sin_addr = inp->inp_faddr; } int in_sockaddr(struct socket *so, struct mbuf *nam) { struct inpcb *inp; inp = sotoinpcb(so); in_setsockaddr(inp, nam); return (0); } int in_peeraddr(struct socket *so, struct mbuf *nam) { struct inpcb *inp; inp = sotoinpcb(so); in_setpeeraddr(inp, nam); return (0); } /* * Pass some notification to all connections of a protocol * associated with address dst. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, int errno, void (*notify)(struct inpcb *, int)) { SIMPLEQ_HEAD(, inpcb) inpcblist; struct inpcb *inp; struct in_addr faddr; u_int rdomain; if (dst->sa_family != AF_INET) return; faddr = satosin(dst)->sin_addr; if (faddr.s_addr == INADDR_ANY) return; if (notify == NULL) return; /* * Use a temporary notify list protected by rwlock to run over * selected PCB. This is necessary as the list of all PCB is * protected by a mutex. Notify may call ip_output() eventually * which may sleep as pf lock is a rwlock. Also the SRP * implementation of the routing table might sleep. * The same inp_notify list entry and inpt_notify rwlock are * used for UDP multicast and raw IP delivery. 
*/ SIMPLEQ_INIT(&inpcblist); rdomain = rtable_l2(rtable); rw_enter_write(&table->inpt_notify); mtx_enter(&table->inpt_mtx); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { #ifdef INET6 if (inp->inp_flags & INP_IPV6) continue; #endif if (inp->inp_faddr.s_addr != faddr.s_addr || rtable_l2(inp->inp_rtableid) != rdomain || inp->inp_socket == NULL) { continue; } in_pcbref(inp); SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); } mtx_leave(&table->inpt_mtx); while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); (*notify)(inp, errno); in_pcbunref(inp); } rw_exit_write(&table->inpt_notify); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { struct rtentry *rt = inp->inp_route.ro_rt; if (rt) { inp->inp_route.ro_rt = NULL; if (rt->rt_flags & RTF_DYNAMIC) { struct ifnet *ifp; ifp = if_get(rt->rt_ifidx); /* * If the interface is gone, all its attached * route entries have been removed from the table, * so we're dealing with a stale cache and have * nothing to do. */ if (ifp != NULL) rtdeletemsg(rt, ifp, inp->inp_rtableid); if_put(ifp); } /* * A new route can be allocated * the next time output is attempted. * rtfree() needs to be called in anycase because the inp * is still holding a reference to rt. */ rtfree(rt); } } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ void in_rtchange(struct inpcb *inp, int errno) { if (inp->inp_route.ro_rt) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; /* * A new route can be allocated the next time * output is attempted. */ } } struct inpcb * in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg, int flags, u_int rtable) { struct inpcb *inp, *match = NULL; int matchwild = 3, wildcard; u_int16_t lport = lport_arg; struct in_addr laddr = *(struct in_addr *)laddrp; #ifdef INET6 struct in6_addr *laddr6 = (struct in6_addr *)laddrp; #endif struct inpcbhead *head; u_int rdomain; rdomain = rtable_l2(rtable); mtx_enter(&table->inpt_mtx); head = in_pcblhash(table, rdomain, lport); LIST_FOREACH(inp, head, inp_lhash) { if (rtable_l2(inp->inp_rtableid) != rdomain) continue; if (inp->inp_lport != lport) continue; wildcard = 0; #ifdef INET6 if (ISSET(flags, INPLOOKUP_IPV6)) { if (!ISSET(inp->inp_flags, INP_IPV6)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) wildcard++; if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6)) { if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) || IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; else continue; } } else #endif /* INET6 */ { #ifdef INET6 if (ISSET(inp->inp_flags, INP_IPV6)) continue; #endif /* INET6 */ if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != laddr.s_addr) { if (inp->inp_laddr.s_addr == INADDR_ANY || laddr.s_addr == INADDR_ANY) wildcard++; else continue; } } if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) && wildcard < matchwild) { match = inp; if ((matchwild = wildcard) == 0) break; } } in_pcbref(match); mtx_leave(&table->inpt_mtx); return (match); } struct rtentry * in_pcbrtentry(struct inpcb *inp) { struct route *ro; ro = &inp->inp_route; /* check if route is still valid */ if (!rtisvalid(ro->ro_rt)) { rtfree(ro->ro_rt); ro->ro_rt = NULL; } /* * No route yet, so try to acquire one. 
*/ if (ro->ro_rt == NULL) { #ifdef INET6 memset(ro, 0, sizeof(struct route_in6)); #else memset(ro, 0, sizeof(struct route)); #endif switch(sotopf(inp->inp_socket)) { #ifdef INET6 case PF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) break; ro->ro_dst.sa_family = AF_INET6; ro->ro_dst.sa_len = sizeof(struct sockaddr_in6); satosin6(&ro->ro_dst)->sin6_addr = inp->inp_faddr6; ro->ro_tableid = inp->inp_rtableid; ro->ro_rt = rtalloc_mpath(&ro->ro_dst, &inp->inp_laddr6.s6_addr32[0], ro->ro_tableid); break; #endif /* INET6 */ case PF_INET: if (inp->inp_faddr.s_addr == INADDR_ANY) break; ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr; ro->ro_tableid = inp->inp_rtableid; ro->ro_rt = rtalloc_mpath(&ro->ro_dst, &inp->inp_laddr.s_addr, ro->ro_tableid); break; } } return (ro->ro_rt); } /* * Return an IPv4 address, which is the most appropriate for a given * destination. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ int in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin, struct inpcb *inp) { struct ip_moptions *mopts = inp->inp_moptions; struct route *ro = &inp->inp_route; struct in_addr *laddr = &inp->inp_laddr; u_int rtableid = inp->inp_rtableid; struct sockaddr *ip4_source = NULL; struct sockaddr_in *sin2; struct in_ifaddr *ia = NULL; /* * If the socket(if any) is already bound, use that bound address * unless it is INADDR_ANY or INADDR_BROADCAST. */ if (laddr->s_addr != INADDR_ANY && laddr->s_addr != INADDR_BROADCAST) { *insrc = *laddr; return (0); } /* * If the destination address is multicast or limited * broadcast (255.255.255.255) and an outgoing interface has * been set as a multicast option, use the address of that * interface as our source address. */ if ((IN_MULTICAST(sin->sin_addr.s_addr) || sin->sin_addr.s_addr == INADDR_BROADCAST) && mopts != NULL) { struct ifnet *ifp; ifp = if_get(mopts->imo_ifidx); if (ifp != NULL) { if (ifp->if_rdomain == rtable_l2(rtableid)) IFP_TO_IA(ifp, ia); if (ia == NULL) { if_put(ifp); return (EADDRNOTAVAIL); } *insrc = ia->ia_addr.sin_addr; if_put(ifp); return (0); } } /* * If route is known or can be allocated now, * our src addr is taken from the i/f, else punt. */ if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) || (satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr)) { rtfree(ro->ro_rt); ro->ro_rt = NULL; } if (ro->ro_rt == NULL) { /* No route yet, so try to acquire one */ ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); satosin(&ro->ro_dst)->sin_addr = sin->sin_addr; ro->ro_tableid = rtableid; ro->ro_rt = rtalloc_mpath(&ro->ro_dst, NULL, ro->ro_tableid); /* * It is important to zero out the rest of the * struct sockaddr_in when mixing v6 & v4! */ sin2 = satosin(&ro->ro_dst); memset(sin2->sin_zero, 0, sizeof(sin2->sin_zero)); } /* * If we found a route, use the address * corresponding to the outgoing interface. 
*/ if (ro->ro_rt != NULL) ia = ifatoia(ro->ro_rt->rt_ifa); /* * Use preferred source address if : * - destination is not onlink * - preferred source address is set * - output interface is UP */ if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) && !(ro->ro_rt->rt_flags & RTF_HOST)) { ip4_source = rtable_getsource(rtableid, AF_INET); if (ip4_source != NULL) { struct ifaddr *ifa; if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) != NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) { *insrc = satosin(ip4_source)->sin_addr; return (0); } } } if (ia == NULL) return (EADDRNOTAVAIL); *insrc = ia->ia_addr.sin_addr; return (0); } void in_pcbrehash(struct inpcb *inp) { struct inpcbtable *table = inp->inp_table; mtx_enter(&table->inpt_mtx); LIST_REMOVE(inp, inp_lhash); LIST_REMOVE(inp, inp_hash); in_pcbhash_insert(inp); mtx_leave(&table->inpt_mtx); } void in_pcbhash_insert(struct inpcb *inp) { struct inpcbtable *table = inp->inp_table; struct inpcbhead *head; NET_ASSERT_LOCKED(); MUTEX_ASSERT_LOCKED(&table->inpt_mtx); head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport); LIST_INSERT_HEAD(head, inp, inp_lhash); #ifdef INET6 if (inp->inp_flags & INP_IPV6) head = in6_pcbhash(table, rtable_l2(inp->inp_rtableid), &inp->inp_faddr6, inp->inp_fport, &inp->inp_laddr6, inp->inp_lport); else #endif /* INET6 */ head = in_pcbhash(table, rtable_l2(inp->inp_rtableid), &inp->inp_faddr, inp->inp_fport, &inp->inp_laddr, inp->inp_lport); LIST_INSERT_HEAD(head, inp, inp_hash); } struct inpcb * in_pcbhash_lookup(struct inpcbtable *table, u_int rdomain, const struct in_addr *faddr, u_short fport, const struct in_addr *laddr, u_short lport) { struct inpcbhead *head; struct inpcb *inp; NET_ASSERT_LOCKED(); MUTEX_ASSERT_LOCKED(&table->inpt_mtx); head = in_pcbhash(table, rdomain, faddr, fport, laddr, lport); LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if (ISSET(inp->inp_flags, INP_IPV6)) continue; #endif if (inp->inp_fport == fport && inp->inp_lport == lport && inp->inp_faddr.s_addr == faddr->s_addr && inp->inp_laddr.s_addr == laddr->s_addr && rtable_l2(inp->inp_rtableid) == rdomain) { break; } } if (inp != NULL) { /* * Move this PCB to the head of hash chain so that * repeated accesses are quicker. This is analogous to * the historic single-entry PCB cache. 
*/ if (inp != LIST_FIRST(head)) { LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } } return (inp); } int in_pcbresize(struct inpcbtable *table, int hashsize) { u_long nmask, nlmask; int osize; void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl; struct inpcb *inp; MUTEX_ASSERT_LOCKED(&table->inpt_mtx); ohashtbl = table->inpt_hashtbl; olhashtbl = table->inpt_lhashtbl; osize = table->inpt_size; nhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nmask); if (nhashtbl == NULL) return ENOBUFS; nlhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nlmask); if (nlhashtbl == NULL) { hashfree(nhashtbl, hashsize, M_PCB); return ENOBUFS; } table->inpt_hashtbl = nhashtbl; table->inpt_lhashtbl = nlhashtbl; table->inpt_mask = nmask; table->inpt_lmask = nlmask; table->inpt_size = hashsize; arc4random_buf(&table->inpt_key, sizeof(table->inpt_key)); arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { LIST_REMOVE(inp, inp_lhash); LIST_REMOVE(inp, inp_hash); in_pcbhash_insert(inp); } hashfree(ohashtbl, osize, M_PCB); hashfree(olhashtbl, osize, M_PCB); return (0); } #ifdef DIAGNOSTIC int in_pcbnotifymiss = 0; #endif /* * The in(6)_pcblookup functions are used to locate connected sockets * quickly: * faddr.fport <-> laddr.lport * No wildcard matching is done so that listening sockets are not found. * If the functions return NULL in(6)_pcblookup_listen can be used to * find a listening/bound socket that may accept the connection. * After those two lookups no other are necessary. */ struct inpcb * in_pcblookup(struct inpcbtable *table, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, u_int rtable) { struct inpcb *inp; u_int rdomain; rdomain = rtable_l2(rtable); mtx_enter(&table->inpt_mtx); inp = in_pcbhash_lookup(table, rdomain, &faddr, fport, &laddr, lport); in_pcbref(inp); mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n", __func__, ntohl(faddr.s_addr), ntohs(fport), ntohl(laddr.s_addr), ntohs(lport), rdomain); } #endif return (inp); } /* * The in(6)_pcblookup_listen functions are used to locate listening * sockets quickly. This are sockets with unspecified foreign address * and port: * *.* <-> laddr.lport * *.* <-> *.lport */ struct inpcb * in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr, u_int lport_arg, struct mbuf *m, u_int rtable) { const struct in_addr *key1, *key2; struct inpcb *inp; u_int16_t lport = lport_arg; u_int rdomain; key1 = &laddr; key2 = &zeroin_addr; #if NPF > 0 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { struct pf_divert *divert; divert = pf_find_divert(m); KASSERT(divert != NULL); switch (divert->type) { case PF_DIVERT_TO: key1 = key2 = &divert->addr.v4; lport = divert->port; break; case PF_DIVERT_REPLY: return (NULL); default: panic("%s: unknown divert type %d, mbuf %p, divert %p", __func__, divert->type, m, divert); } } else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) { /* * Redirected connections should not be treated the same * as connections directed to 127.0.0.0/8 since localhost * can only be accessed from the host itself. * For example portmap(8) grants more permissions for * connections to the socket bound to 127.0.0.1 than * to the * socket. 
*/ key1 = &zeroin_addr; key2 = &laddr; } #endif rdomain = rtable_l2(rtable); mtx_enter(&table->inpt_mtx); inp = in_pcbhash_lookup(table, rdomain, &zeroin_addr, 0, key1, lport); if (inp == NULL && key1->s_addr != key2->s_addr) { inp = in_pcbhash_lookup(table, rdomain, &zeroin_addr, 0, key2, lport); } in_pcbref(inp); mtx_leave(&table->inpt_mtx); #ifdef DIAGNOSTIC if (inp == NULL && in_pcbnotifymiss) { printf("%s: laddr=%08x lport=%d rdom=%u\n", __func__, ntohl(laddr.s_addr), ntohs(lport), rdomain); } #endif return (inp); }
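/*
 * Illustrative sketch, not part of the original file: how a transport
 * protocol's input path might combine the two lookup steps documented
 * above (exact match first, then the listening-socket fallback).  The
 * table, addresses, ports and mbuf handed in are hypothetical parameters;
 * real callers such as TCP and UDP input derive them from the received
 * packet headers.
 */
static struct inpcb *
example_in_pcbdemux(struct inpcbtable *table, struct in_addr faddr,
    u_int16_t fport, struct in_addr laddr, u_int16_t lport, struct mbuf *m,
    u_int rtableid)
{
	struct inpcb *inp;

	/* 1. Exact match for a connected socket: faddr.fport <-> laddr.lport. */
	inp = in_pcblookup(table, faddr, fport, laddr, lport, rtableid);
	if (inp != NULL)
		return (inp);	/* referenced; the caller does in_pcbunref() */

	/* 2. Otherwise look for a listening/bound socket: *.* <-> laddr.lport. */
	return (in_pcblookup_listen(table, laddr, lport, m, rtableid));
}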
/* $OpenBSD: ffs_vnops.c,v 1.100 2022/06/26 05:20:43 visa Exp $ */
/* $NetBSD: ffs_vnops.c,v 1.7 1996/05/11 18:27:24 mycroft Exp $ */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vnops.c 8.10 (Berkeley) 8/10/94 */ #include <sys/param.h> #include <sys/systm.h> #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/signalvar.h> #include <sys/pool.h> #include <sys/event.h> #include <sys/specdev.h> #include <miscfs/fifofs/fifo.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/ffs_extern.h> const struct vops ffs_vops = { .vop_lookup = ufs_lookup, .vop_create = ufs_create, .vop_mknod = ufs_mknod, .vop_open = ufs_open, .vop_close = ufs_close, .vop_access = ufs_access, .vop_getattr = ufs_getattr, .vop_setattr = ufs_setattr, .vop_read = ffs_read, .vop_write = ffs_write, .vop_ioctl = ufs_ioctl, .vop_kqfilter = ufs_kqfilter, .vop_revoke = vop_generic_revoke, .vop_fsync = ffs_fsync, .vop_remove = ufs_remove, .vop_link = ufs_link, .vop_rename = ufs_rename, .vop_mkdir = ufs_mkdir, .vop_rmdir = ufs_rmdir, .vop_symlink = ufs_symlink, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_abortop = vop_generic_abortop, .vop_inactive = ufs_inactive, .vop_reclaim = ffs_reclaim, .vop_lock = ufs_lock, .vop_unlock = ufs_unlock, .vop_bmap = ufs_bmap, .vop_strategy = ufs_strategy, .vop_print = ufs_print, .vop_islocked = ufs_islocked, .vop_pathconf = ufs_pathconf, .vop_advlock = ufs_advlock, .vop_bwrite = vop_generic_bwrite }; const struct vops ffs_specvops = { .vop_close = ufsspec_close, .vop_access = ufs_access, .vop_getattr = ufs_getattr, .vop_setattr = ufs_setattr, .vop_read = ufsspec_read, .vop_write = ufsspec_write, .vop_fsync = ffs_fsync, .vop_inactive = ufs_inactive, .vop_reclaim = ffs_reclaim, .vop_lock = ufs_lock, .vop_unlock = ufs_unlock, .vop_print = ufs_print, .vop_islocked = ufs_islocked, /* XXX: Keep in sync with spec_vops */ .vop_lookup = vop_generic_lookup, .vop_create = vop_generic_badop, .vop_mknod = vop_generic_badop, .vop_open = spec_open, .vop_ioctl = spec_ioctl, .vop_kqfilter = spec_kqfilter, .vop_revoke = vop_generic_revoke, .vop_remove = vop_generic_badop, .vop_link = vop_generic_badop, .vop_rename = vop_generic_badop, .vop_mkdir = vop_generic_badop, .vop_rmdir = vop_generic_badop, .vop_symlink = vop_generic_badop, .vop_readdir = vop_generic_badop, .vop_readlink = vop_generic_badop, .vop_abortop = vop_generic_badop, .vop_bmap = vop_generic_bmap, .vop_strategy = spec_strategy, .vop_pathconf = spec_pathconf, .vop_advlock = spec_advlock, .vop_bwrite = vop_generic_bwrite, }; #ifdef FIFO const struct vops ffs_fifovops = { .vop_close = ufsfifo_close, .vop_access = ufs_access, .vop_getattr = ufs_getattr, .vop_setattr = ufs_setattr, .vop_read = ufsfifo_read, .vop_write = ufsfifo_write, .vop_fsync = ffs_fsync, .vop_inactive = ufs_inactive, .vop_reclaim = ffsfifo_reclaim, .vop_lock = ufs_lock, .vop_unlock = ufs_unlock, .vop_print = ufs_print, .vop_islocked = ufs_islocked, .vop_bwrite = 
vop_generic_bwrite, /* XXX: Keep in sync with fifo_vops */ .vop_lookup = vop_generic_lookup, .vop_create = vop_generic_badop, .vop_mknod = vop_generic_badop, .vop_open = fifo_open, .vop_ioctl = fifo_ioctl, .vop_kqfilter = fifo_kqfilter, .vop_revoke = vop_generic_revoke, .vop_remove = vop_generic_badop, .vop_link = vop_generic_badop, .vop_rename = vop_generic_badop, .vop_mkdir = vop_generic_badop, .vop_rmdir = vop_generic_badop, .vop_symlink = vop_generic_badop, .vop_readdir = vop_generic_badop, .vop_readlink = vop_generic_badop, .vop_abortop = vop_generic_badop, .vop_bmap = vop_generic_bmap, .vop_strategy = vop_generic_badop, .vop_pathconf = fifo_pathconf, .vop_advlock = fifo_advlock }; #endif /* FIFO */ /* * Vnode op for reading. */ int ffs_read(void *v) { struct vop_read_args *ap = v; struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; int size, xfersize, blkoffset; mode_t mode; int error; vp = ap->a_vp; ip = VTOI(vp); mode = DIP(ip, mode); uio = ap->a_uio; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if (DIP(ip, size) < ip->i_ump->um_maxsymlinklen || (ip->i_ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif fs = ip->i_fs; if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = DIP(ip, size) - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = fs->fs_bsize; /* WAS blksize(fs, ip, lbn); */ blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= DIP(ip, size)) error = bread(vp, lbn, size, &bp); else if (lbn - 1 == ip->i_ci.ci_lastr || uio->uio_resid > xfersize) { error = bread_cluster(vp, lbn, size, &bp); } else error = bread(vp, lbn, size, &bp); if (error) break; ip->i_ci.ci_lastr = lbn; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove(bp->b_data + blkoffset, xfersize, uio); if (error) break; brelse(bp); } if (bp != NULL) brelse(bp); if (!(vp->v_mount->mnt_flag & MNT_NOATIME) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) { ip->i_flag |= IN_ACCESS; } return (error); } /* * Vnode op for writing. 
*/ int ffs_write(void *v) { struct vop_write_args *ap = v; struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; daddr_t lbn; off_t osize; int blkoffset, error, extended, flags, ioflag, size, xfersize; size_t resid; ssize_t overrun; extended = 0; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif /* * If writing 0 bytes, succeed and do not change * update time or file offset (standards compliance) */ if (uio->uio_resid == 0) return (0); switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = DIP(ip, size); if ((DIP(ip, flags) & APPEND) && uio->uio_offset != DIP(ip, size)) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: if ((ioflag & IO_SYNC) == 0) panic("ffs_write: nonsync dir write"); break; default: panic("ffs_write: type %d", vp->v_type); } fs = ip->i_fs; if (uio->uio_offset < 0 || (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* do the filesize rlimit check */ if ((error = vn_fsizechk(vp, uio, ioflag, &overrun))) return (error); resid = uio->uio_resid; osize = DIP(ip, size); flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (fs->fs_bsize > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; if ((error = UFS_BUF_ALLOC(ip, uio->uio_offset, xfersize, ap->a_cred, flags, &bp)) != 0) break; if (uio->uio_offset + xfersize > DIP(ip, size)) { DIP_ASSIGN(ip, size, uio->uio_offset + xfersize); uvm_vnp_setsize(vp, DIP(ip, size)); extended = 1; } (void)uvm_vnp_uncache(vp); size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove(bp->b_data + blkoffset, xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we don't need to clear buffers that were * allocated with the B_CLRBUF flag set. */ if (error != 0 && !(flags & B_CLRBUF)) memset(bp->b_data + blkoffset, 0, xfersize); if (ioflag & IO_NOCACHE) bp->b_flags |= B_NOCACHE; if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->fs_bsize) { bawrite(bp); } else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0 && !vnoperm(vp)) DIP_ASSIGN(ip, mode, DIP(ip, mode) & ~(ISUID | ISGID)); if (resid > uio->uio_resid) VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); if (error) { if (ioflag & IO_UNIT) { (void)UFS_TRUNCATE(ip, osize, ioflag & IO_SYNC, ap->a_cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { error = UFS_UPDATE(ip, 1); } /* correct the result for writes clamped by vn_fsizechk() */ uio->uio_resid += overrun; return (error); } /* * Synch an open file. 
*/ int ffs_fsync(void *v) { struct vop_fsync_args *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp, *nbp; int s, error, passes, skipmeta; if (vp->v_type == VBLK && vp->v_specmountpoint != NULL && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) softdep_fsync_mountdev(vp, ap->a_waitfor); /* * Flush all dirty buffers associated with a vnode. */ passes = NIADDR + 1; skipmeta = 0; if (ap->a_waitfor == MNT_WAIT) skipmeta = 1; s = splbio(); loop: LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { bp->b_flags &= ~B_SCANNED; } LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, this pass is the first time through on a * synchronous flush request and the buffer being considered * is metadata, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if (bp->b_flags & (B_BUSY | B_SCANNED)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); if (skipmeta && bp->b_lblkno < 0) continue; if (ap->a_waitfor != MNT_WAIT && LIST_FIRST(&bp->b_dep) != NULL && (bp->b_flags & B_DEFERRED) == 0 && buf_countdeps(bp, 0, 1)) { bp->b_flags |= B_DEFERRED; continue; } bremfree(bp); buf_acquire(bp); bp->b_flags |= B_SCANNED; splx(s); /* * On our final pass through, do all I/O synchronously * so that we can find out if our flush is failing * because of write errors. */ if (passes > 0 || ap->a_waitfor != MNT_WAIT) (void) bawrite(bp); else if ((error = bwrite(bp)) != 0) return (error); s = splbio(); /* * Since we may have slept during the I/O, we need * to start from a known point. */ nbp = LIST_FIRST(&vp->v_dirtyblkhd); } if (skipmeta) { skipmeta = 0; goto loop; } if (ap->a_waitfor == MNT_WAIT) { vwaitforio(vp, 0, "ffs_fsync", INFSLP); /* * Ensure that any filesystem metadata associated * with the vnode has been written. */ splx(s); if ((error = softdep_sync_metadata(ap)) != 0) return (error); s = splbio(); if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { /* * Block devices associated with filesystems may * have new I/O requests posted for them even if * the vnode is locked, so no amount of trying will * get them clean. Thus we give block devices a * good effort, then just give up. For all other file * types, go around and try again until it is clean. */ if (passes > 0) { passes -= 1; goto loop; } #ifdef DIAGNOSTIC if (vp->v_type != VBLK) vprint("ffs_fsync: dirty", vp); #endif } } splx(s); return (UFS_UPDATE(VTOI(vp), ap->a_waitfor == MNT_WAIT)); } /* * Reclaim an inode so that it can be used for other purposes. */ int ffs_reclaim(void *v) { struct vop_reclaim_args *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int error; if ((error = ufs_reclaim(vp)) != 0) return (error); if (ip->i_din1 != NULL) { #ifdef FFS2 if (ip->i_ump->um_fstype == UM_UFS2) pool_put(&ffs_dinode2_pool, ip->i_din2); else #endif pool_put(&ffs_dinode1_pool, ip->i_din1); } pool_put(&ffs_ino_pool, ip); vp->v_data = NULL; return (0); } #ifdef FIFO int ffsfifo_reclaim(void *v) { fifo_reclaim(v); return (ffs_reclaim(v)); } #endif
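/*
 * Illustrative sketch, not part of the original file: the per-iteration
 * transfer size that the ffs_read() loop above computes.  The 16 KB block
 * size, file size and read offsets are hypothetical, and lblkno()/blkoff()
 * are modelled here as plain division/remainder by the block size, which is
 * the role they play in that loop.  Written as a standalone userland program
 * so the splitting of an unaligned read across blocks can be printed.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const int64_t bsize = 16384;	/* hypothetical fs_bsize */
	int64_t filesize = 40000;	/* hypothetical DIP(ip, size) */
	int64_t offset = 30000;		/* unaligned starting offset */
	int64_t resid = 20000;		/* caller asked for more than remains */

	while (resid > 0) {
		int64_t bytesinfile = filesize - offset;
		int64_t lbn, blkoffset, xfersize;

		if (bytesinfile <= 0)
			break;			/* nothing left in the file */
		lbn = offset / bsize;		/* logical block number */
		blkoffset = offset % bsize;	/* offset within that block */
		xfersize = bsize - blkoffset;	/* at most to end of block */
		if (resid < xfersize)
			xfersize = resid;	/* clamp to what was requested */
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;	/* clamp to end of file */
		printf("lbn %lld: copy %lld bytes at block offset %lld\n",
		    (long long)lbn, (long long)xfersize, (long long)blkoffset);
		offset += xfersize;
		resid -= xfersize;
	}
	return (0);
}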