/*	$NetBSD: tmpfs.h,v 1.56 2020/05/17 19:39:15 ad Exp $	*/

/*
 * Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _FS_TMPFS_TMPFS_H_
#define _FS_TMPFS_TMPFS_H_

#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error "not supposed to be exposed to userland"
#endif

#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/vnode.h>

/*
 * Internal representation of a tmpfs directory entry.
 *
 * All fields are protected by vnode lock.
 */
typedef struct tmpfs_dirent {
	TAILQ_ENTRY(tmpfs_dirent)	td_entries;

	/* Pointer to the inode this entry refers to. */
	struct tmpfs_node *		td_node;

	/* Sequence number, see tmpfs_dir_getseq(). */
	uint32_t			td_seq;

	/* Name and its length.
*/ char * td_name; uint16_t td_namelen; } tmpfs_dirent_t; TAILQ_HEAD(tmpfs_dir, tmpfs_dirent); /* * Internal representation of a tmpfs file system node -- inode. * * This structure is split in two parts: one holds attributes common * to all file types and the other holds data that is only applicable to * a particular type. * * All fields are protected by vnode lock. The vnode association itself * is protected by vcache. */ typedef struct tmpfs_node { LIST_ENTRY(tmpfs_node) tn_entries; /* * Each inode has a corresponding vnode. It is a bi-directional * association. Whenever vnode is allocated, its v_data field is * set to the inode it reference, and tmpfs_node_t::tn_vnode is * set to point to the said vnode. * * Further attempts to allocate a vnode for this same node will * result in returning a new reference to the value stored in * tn_vnode. It may be NULL when the node is unused (that is, * no vnode has been allocated or it has been reclaimed). */ vnode_t * tn_vnode; /* Prevent node from being reclaimed. */ uint32_t tn_holdcount; /* Directory entry. Only a hint, since hard link can have multiple. */ tmpfs_dirent_t * tn_dirent_hint; /* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */ enum vtype tn_type; /* Inode identifier and generation number. */ ino_t tn_id; uint32_t tn_gen; /* The inode size. */ off_t tn_size; /* Generic node attributes. */ uid_t tn_uid; gid_t tn_gid; mode_t tn_mode; int tn_flags; nlink_t tn_links; unsigned tn_tflags; struct timespec tn_atime; struct timespec tn_mtime; struct timespec tn_ctime; struct timespec tn_birthtime; kmutex_t tn_timelock; /* Head of byte-level lock list (used by tmpfs_advlock). */ struct lockf * tn_lockf; union { /* Type case: VBLK or VCHR. */ struct { dev_t tn_rdev; } tn_dev; /* Type case: VDIR. */ struct { /* Parent directory (root inode points to itself). */ struct tmpfs_node * tn_parent; /* List of directory entries. */ struct tmpfs_dir tn_dir; /* Last given sequence number and their arena. */ uint32_t tn_next_seq; void * tn_seq_arena; /* * Pointer of the last directory entry returned * by the readdir(3) operation. */ struct tmpfs_dirent * tn_readdir_lastp; } tn_dir; /* Type case: VLNK. */ struct tn_lnk { /* The link's target. */ char * tn_link; } tn_lnk; /* Type case: VREG. */ struct tn_reg { /* Underlying UVM object to store contents. */ struct uvm_object * tn_aobj; size_t tn_aobj_pages; } tn_reg; } tn_spec; } tmpfs_node_t; #if defined(_KERNEL) VFS_PROTOS(tmpfs); LIST_HEAD(tmpfs_node_list, tmpfs_node); #define TMPFS_MAXNAMLEN 255 /* Validate maximum td_namelen length. */ CTASSERT(TMPFS_MAXNAMLEN < UINT16_MAX); /* * Reserved values for the virtual entries (the first must be 0) and EOF. * The start/end of the incremental range, see tmpfs_dir_getseq(). */ #define TMPFS_DIRSEQ_DOT 0 #define TMPFS_DIRSEQ_DOTDOT 1 #define TMPFS_DIRSEQ_EOF 2 #define TMPFS_DIRSEQ_START 3 /* inclusive */ #define TMPFS_DIRSEQ_END (1U << 30) /* exclusive */ /* Mark to indicate that the number is not set. */ #define TMPFS_DIRSEQ_NONE (1U << 31) /* Flags: time update requests. */ #define TMPFS_UPDATE_ATIME 0x01 #define TMPFS_UPDATE_MTIME 0x02 #define TMPFS_UPDATE_CTIME 0x04 /* * Bits indicating whiteout use for the directory. * We abuse tmpfs_node_t::tn_gen for that. */ #define TMPFS_WHITEOUT_BIT (1U << 31) #define TMPFS_NODE_GEN_MASK (TMPFS_WHITEOUT_BIT - 1) #define TMPFS_NODE_GEN(node) \ ((node)->tn_gen & TMPFS_NODE_GEN_MASK) /* White-out inode indicator. 
*/ #define TMPFS_NODE_WHITEOUT ((tmpfs_node_t *)-1) /* * Bit indicating this node must be reclaimed when holdcount reaches zero. * Ored into tmpfs_node_t::tn_holdcount. */ #define TMPFS_NODE_RECLAIMED (1U << 30) /* * Internal representation of a tmpfs mount point. */ typedef struct tmpfs_mount { /* Limit and number of bytes in use by the file system. */ uint64_t tm_mem_limit; uint64_t tm_bytes_used; kmutex_t tm_acc_lock; /* Pointer to the root inode. */ tmpfs_node_t * tm_root; /* Maximum number of possible nodes for this file system. */ unsigned int tm_nodes_max; /* Number of nodes currently allocated. */ unsigned int tm_nodes_cnt; /* List of inodes and the lock protecting it. */ kmutex_t tm_lock; struct tmpfs_node_list tm_nodes; } tmpfs_mount_t; /* * This structure maps a file identifier to a tmpfs node. Used by the * NFS code. */ typedef struct tmpfs_fid { uint16_t tf_len; uint16_t tf_pad; uint32_t tf_gen; ino_t tf_id; } tmpfs_fid_t; /* * Prototypes for tmpfs_subr.c. */ void tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *); int tmpfs_construct_node(vnode_t *, vnode_t **, struct vattr *, struct componentname *, char *); int tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t, tmpfs_dirent_t **); void tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *); void tmpfs_dir_attach(tmpfs_node_t *, tmpfs_dirent_t *, tmpfs_node_t *); void tmpfs_dir_detach(tmpfs_node_t *, tmpfs_dirent_t *); tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *); tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *); uint32_t tmpfs_dir_getseq(tmpfs_node_t *, tmpfs_dirent_t *); tmpfs_dirent_t *tmpfs_dir_lookupbyseq(tmpfs_node_t *, off_t); int tmpfs_dir_getdents(tmpfs_node_t *, struct uio *, off_t *); int tmpfs_reg_resize(vnode_t *, off_t); int tmpfs_chflags(vnode_t *, int, kauth_cred_t, lwp_t *); int tmpfs_chmod(vnode_t *, mode_t, kauth_cred_t, lwp_t *); int tmpfs_chown(vnode_t *, uid_t, gid_t, kauth_cred_t, lwp_t *); int tmpfs_chsize(vnode_t *, u_quad_t, kauth_cred_t, lwp_t *); int tmpfs_chtimes(vnode_t *, const struct timespec *, const struct timespec *, const struct timespec *, int, kauth_cred_t, lwp_t *); void tmpfs_update(vnode_t *, unsigned); void tmpfs_update_locked(vnode_t *, unsigned); void tmpfs_update_lazily(vnode_t *, unsigned); /* * Prototypes for tmpfs_mem.c. */ void tmpfs_mntmem_init(tmpfs_mount_t *, uint64_t); void tmpfs_mntmem_destroy(tmpfs_mount_t *); int tmpfs_mntmem_set(tmpfs_mount_t *, uint64_t); size_t tmpfs_mem_info(bool); uint64_t tmpfs_bytes_max(tmpfs_mount_t *); size_t tmpfs_pages_avail(tmpfs_mount_t *); bool tmpfs_mem_incr(tmpfs_mount_t *, size_t); void tmpfs_mem_decr(tmpfs_mount_t *, size_t); tmpfs_dirent_t *tmpfs_dirent_get(tmpfs_mount_t *); void tmpfs_dirent_put(tmpfs_mount_t *, tmpfs_dirent_t *); tmpfs_node_t * tmpfs_node_get(tmpfs_mount_t *); void tmpfs_node_put(tmpfs_mount_t *, tmpfs_node_t *); char * tmpfs_strname_alloc(tmpfs_mount_t *, size_t); void tmpfs_strname_free(tmpfs_mount_t *, char *, size_t); bool tmpfs_strname_neqlen(struct componentname *, struct componentname *); /* * Ensures that the node pointed by 'node' is a directory and that its * contents are consistent with respect to directories. */ #define TMPFS_VALIDATE_DIR(node) \ KASSERT((node)->tn_vnode == NULL || VOP_ISLOCKED((node)->tn_vnode)); \ KASSERT((node)->tn_type == VDIR); \ KASSERT((node)->tn_size % sizeof(tmpfs_dirent_t) == 0); /* * Routines to convert VFS structures to tmpfs internal ones. 
*/ static __inline tmpfs_mount_t * VFS_TO_TMPFS(struct mount *mp) { tmpfs_mount_t *tmp = mp->mnt_data; KASSERT(tmp != NULL); return tmp; } static __inline tmpfs_node_t * VP_TO_TMPFS_DIR(vnode_t *vp) { tmpfs_node_t *node = vp->v_data; KASSERT(node != NULL); TMPFS_VALIDATE_DIR(node); return node; } #endif /* defined(_KERNEL) */ static __inline tmpfs_node_t * VP_TO_TMPFS_NODE(vnode_t *vp) { tmpfs_node_t *node = vp->v_data; #ifdef KASSERT KASSERT(node != NULL); #endif return node; } #endif /* _FS_TMPFS_TMPFS_H_ */
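
/*
 * Minimal usage sketch, kept under #if 0: how the TMPFS_WHITEOUT_BIT /
 * TMPFS_NODE_GEN_MASK split of tmpfs_node_t::tn_gen defined above is
 * meant to be read.  The example_* helpers are hypothetical and exist
 * only to illustrate the bit layout; the real consumers are the tmpfs
 * whiteout and file-handle code.
 */
#if 0
static __inline bool
example_dir_has_whiteouts(const tmpfs_node_t *dnode)
{

	/* The top bit of tn_gen flags whiteout use in a directory. */
	return (dnode->tn_gen & TMPFS_WHITEOUT_BIT) != 0;
}

static __inline uint32_t
example_node_generation(const tmpfs_node_t *node)
{

	/* Mask the flag bit off to recover the plain generation number. */
	return TMPFS_NODE_GEN(node);
}
#endif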
/*	$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $	*/

/*-
 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Shared support code for kernels built with the DEBUG option.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/cpu.h>

#include <uvm/uvm_extern.h>

#include <machine/lock.h>

/*
 * Allocation/free validation by pointer address.  Introduces
 * significant overhead and is not enabled by default.  Patch
 * `debug_freecheck' to 1 at boot time to enable.
*/ #define FREECHECK_BYTES (8*1024*1024) typedef struct fcitem { void *i_addr; struct fcitem *i_next; } fcitem_t; fcitem_t *freecheck_free; __cpu_simple_lock_t freecheck_lock; u_int debug_freecheck; void debug_init(void) { size_t cnt; fcitem_t *i; __cpu_simple_lock_init(&freecheck_lock); if (debug_freecheck) { i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0, UVM_KMF_WIRED); if (i == NULL) { printf("freecheck_init: unable to allocate memory"); return; } for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) { i->i_next = freecheck_free; freecheck_free = i++; } } } void freecheck_out(void **head, void *addr) { fcitem_t *i; int s; if (!debug_freecheck) return; s = splvm(); __cpu_simple_lock(&freecheck_lock); for (i = *head; i != NULL; i = i->i_next) { if (i->i_addr != addr) continue; __cpu_simple_unlock(&freecheck_lock); splx(s); panic("freecheck_out: %p already out", addr); } if ((i = freecheck_free) != NULL) { freecheck_free = i->i_next; i->i_addr = addr; i->i_next = *head; *head = i; } __cpu_simple_unlock(&freecheck_lock); splx(s); if (i == NULL) { if (atomic_swap_uint(&debug_freecheck, 1) == 0) printf("freecheck_out: no more slots\n"); } } void freecheck_in(void **head, void *addr) { fcitem_t *i; void *pp; int s; if (!debug_freecheck) return; s = splvm(); __cpu_simple_lock(&freecheck_lock); for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) { if (i->i_addr == addr) { *(fcitem_t **)pp = i->i_next; i->i_next = freecheck_free; freecheck_free = i; break; } } __cpu_simple_unlock(&freecheck_lock); splx(s); if (i != NULL) return; #ifdef DDB printf("freecheck_in: %p not out\n", addr); Debugger(); #else panic("freecheck_in: %p not out", addr); #endif }
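
/*
 * Minimal usage sketch, kept under #if 0: how an allocator pairs the
 * two hooks above.  The example_* names are hypothetical; real callers
 * normally go through the FREECHECK_OUT()/FREECHECK_IN() macros in
 * <sys/debug.h>, which compile away in kernels built without DEBUG.
 */
#if 0
static void *example_inuse;		/* head of the in-use list */

static void *
example_alloc(size_t size)
{
	void *p;

	p = kmem_alloc(size, KM_SLEEP);
	/* Record that `p' has been handed out. */
	freecheck_out(&example_inuse, p);
	return p;
}

static void
example_free(void *p, size_t size)
{

	/* Verify `p' was handed out and is not being freed twice. */
	freecheck_in(&example_inuse, p);
	kmem_free(p, size);
}
#endif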
/*	$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $	*/

/*-
 * Copyright (c) 2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Generic rename abstraction.
 *
 * Rename is unbelievably hairy.  Try to use this if you can --
 * otherwise you are practically guaranteed to get it wrong.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $"); #include <sys/param.h> #include <sys/kauth.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/types.h> #include <miscfs/genfs/genfs.h> /* * Sample copypasta for implementing VOP_RENAME via genfs_rename. * Don't change this template without carefully considering whether * every other file system that already uses it needs to change too. * That way, once we have changed all the file systems to use it, we * can easily replace mumblefs_rename by mumblefs_sane_rename and * eliminate the insane API altogether. */ /* begin sample copypasta */ #if 0 static const struct genfs_rename_ops mumblefs_genfs_rename_ops; /* * mumblefs_sane_rename: The hairiest vop, with the saner API. * * Arguments: * * . fdvp (from directory vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tcnp (to component name), * . cred (credentials structure), and * . posixly_correct (flag for behaviour if target & source link same file). * * fdvp and tdvp may be the same, and must be referenced and unlocked. */ static int mumblefs_sane_rename( struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct) { struct mumblefs_lookup_results fulr, tulr; return genfs_sane_rename(&mumblefs_genfs_rename_ops, fdvp, fcnp, &fulr, tdvp, tcnp, &tulr, cred, posixly_correct); } /* * mumblefs_rename: The hairiest vop, with the insanest API. Defer to * genfs_insane_rename immediately. */ int mumblefs_rename(void *v) { return genfs_insane_rename(v, &mumblefs_sane_rename); } #endif /* end sample copypasta */ /* * Forward declarations */ static int genfs_rename_enter(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct vnode *, struct componentname *, void *, struct vnode **); static int genfs_rename_enter_common(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct componentname *, void *, struct vnode **); static int genfs_rename_enter_separate(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode **, struct vnode *, struct componentname *, void *, struct vnode **); static int genfs_rename_lock(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, int, int, int, struct vnode *, struct componentname *, bool, void *, struct vnode **, struct vnode *, struct componentname *, bool, void *, struct vnode **); static void genfs_rename_exit(const struct genfs_rename_ops *, struct mount *, struct vnode *, struct vnode *, struct vnode *, struct vnode *); static int genfs_rename_remove(const struct genfs_rename_ops *, struct mount *, kauth_cred_t, struct vnode *, struct componentname *, void *, struct vnode *, nlink_t *); /* * genfs_insane_rename: Generic implementation of the insane API for * the rename vop. * * Arguments: * * . fdvp (from directory vnode), * . fvp (from vnode), * . fcnp (from component name), * . tdvp (to directory vnode), * . tvp (to vnode, or NULL), and * . tcnp (to component name). * * Any pair of vnode parameters may have the same vnode. * * On entry, * * . fdvp, fvp, tdvp, and tvp are referenced, * . fdvp and fvp are unlocked, and * . tdvp and tvp (if nonnull) are locked. * * On exit, * * . 
fdvp, fvp, tdvp, and tvp (if nonnull) are unreferenced, and * . tdvp and tvp (if nonnull) are unlocked. */ int genfs_insane_rename(void *v, int (*sane_rename)(struct vnode *fdvp, struct componentname *fcnp, struct vnode *tdvp, struct componentname *tcnp, kauth_cred_t cred, bool posixly_correct)) { struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap = v; struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; kauth_cred_t cred; int error; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(fcnp != NULL); KASSERT(fcnp->cn_nameptr != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(fcnp->cn_nameptr != NULL); /* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */ KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); cred = fcnp->cn_cred; /* * XXX Want a better equality test. `tcnp->cn_cred == cred' * hoses p2k because puffs transmits the creds separately and * allocates distinct but equivalent structures for them. */ KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred)); /* * Sanitize our world from the VFS insanity. Unlock the target * directory and node, which are locked. Release the children, * which are referenced, since we'll be looking them up again * later. */ VOP_UNLOCK(tdvp); if ((tvp != NULL) && (tvp != tdvp)) VOP_UNLOCK(tvp); vrele(fvp); if (tvp != NULL) vrele(tvp); error = (*sane_rename)(fdvp, fcnp, tdvp, tcnp, cred, false); /* * All done, whether with success or failure. Release the * directory nodes now, as the caller expects from the VFS * protocol. */ vrele(fdvp); vrele(tdvp); return error; } /* * genfs_sane_rename: Generic implementation of the saner API for the * rename vop. Handles ancestry checks, locking, and permissions * checks. Caller is responsible for implementing the genfs rename * operations. * * fdvp and tdvp must be referenced and unlocked. */ int genfs_sane_rename(const struct genfs_rename_ops *ops, struct vnode *fdvp, struct componentname *fcnp, void *fde, struct vnode *tdvp, struct componentname *tcnp, void *tde, kauth_cred_t cred, bool posixly_correct) { struct mount *mp; struct vnode *fvp = NULL, *tvp = NULL; nlink_t tvp_new_nlink = 0; int error; KASSERT(ops != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); /* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */ /* KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */ KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == tdvp->v_mount); KASSERT(fcnp != tcnp); KASSERT(fcnp->cn_nameiop == DELETE); KASSERT(tcnp->cn_nameiop == RENAME); /* XXX Want a better equality test. */ KASSERT(kauth_cred_uidmatch(cred, fcnp->cn_cred)); KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred)); mp = fdvp->v_mount; KASSERT(mp != NULL); KASSERT(mp == tdvp->v_mount); /* XXX How can we be sure this stays true? */ KASSERT((mp->mnt_flag & MNT_RDONLY) == 0); /* Reject rename("x/..", ...) and rename(..., "x/..") early. */ if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) return EINVAL; /* XXX EISDIR? 
*/ error = genfs_rename_enter(ops, mp, cred, fdvp, fcnp, fde, &fvp, tdvp, tcnp, tde, &tvp); if (error) return error; /* * Check that everything is locked and looks right. */ KASSERT(fvp != NULL); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * If the source and destination are the same object, we need * only at most delete the source entry. We are guaranteed at * this point that the entries are distinct. */ if (fvp == tvp) { KASSERT(tvp != NULL); if (fvp->v_type == VDIR) /* XXX This shouldn't be possible. */ error = EINVAL; else if (posixly_correct) /* POSIX sez to leave them alone. */ error = 0; else if ((fdvp == tdvp) && (fcnp->cn_namelen == tcnp->cn_namelen) && (memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) == 0)) /* Renaming an entry over itself does nothing. */ error = 0; else { /* XXX Can't use VOP_REMOVE because of locking. */ error = genfs_rename_remove(ops, mp, cred, fdvp, fcnp, fde, fvp, &tvp_new_nlink); VN_KNOTE(fdvp, NOTE_WRITE); VN_KNOTE(fvp, tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK); } goto out; } KASSERT(fvp != tvp); KASSERT((fdvp != tdvp) || (fcnp->cn_namelen != tcnp->cn_namelen) || (memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0)); /* * If the target exists, refuse to rename a directory over a * non-directory or vice versa, or to clobber a non-empty * directory. */ if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type == VDIR) error = (ops->gro_directory_empty_p(mp, cred, tvp, tdvp)? 0 : ENOTEMPTY); else if (fvp->v_type == VDIR && tvp->v_type != VDIR) error = ENOTDIR; else if (fvp->v_type != VDIR && tvp->v_type == VDIR) error = EISDIR; else error = 0; if (error) goto out; KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR)); } /* * Authorize the rename. */ error = ops->gro_rename_check_possible(mp, fdvp, fvp, tdvp, tvp); if (error) goto out; error = ops->gro_rename_check_permitted(mp, cred, fdvp, fvp, tdvp, tvp); error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, fvp, fdvp, error); error = kauth_authorize_vnode(cred, KAUTH_VNODE_RENAME, tvp, tdvp, error); if (error) goto out; /* * Everything is hunky-dory. Shuffle the directory entries. */ error = ops->gro_rename(mp, cred, fdvp, fcnp, fde, fvp, tdvp, tcnp, tde, tvp, &tvp_new_nlink); if (error) goto out; /* Success! */ genfs_rename_knote(fdvp, fvp, tdvp, tvp, tvp_new_nlink); out: genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp); return error; } /* * genfs_rename_knote: Note events about the various vnodes in a * rename. To be called by gro_rename on success. The only pair of * vnodes that may be identical is {fdvp, tdvp}. tvp_new_nlink is * the resulting link count of tvp. 
*/ void genfs_rename_knote(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, nlink_t tvp_new_nlink) { long fdvp_events, tdvp_events; bool directory_p, reparent_p, replaced_p; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); directory_p = (fvp->v_type == VDIR); reparent_p = (fdvp != tdvp); replaced_p = (tvp != NULL); KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR))); fdvp_events = NOTE_WRITE; if (directory_p && reparent_p) fdvp_events |= NOTE_LINK; VN_KNOTE(fdvp, fdvp_events); VN_KNOTE(fvp, NOTE_RENAME); if (reparent_p) { tdvp_events = NOTE_WRITE; if (!replaced_p) { tdvp_events |= NOTE_EXTEND; if (directory_p) tdvp_events |= NOTE_LINK; } VN_KNOTE(tdvp, tdvp_events); } if (replaced_p) VN_KNOTE(tvp, (tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK)); } /* * genfs_rename_cache_purge: Purge the name cache. To be called by * gro_rename on success. The only pair of vnodes that may be * identical is {fdvp, tdvp}. */ void genfs_rename_cache_purge(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); /* * XXX What actually needs to be purged? */ cache_purge(fdvp); if (fvp->v_type == VDIR) cache_purge(fvp); if (tdvp != fdvp) cache_purge(tdvp); if ((tvp != NULL) && (tvp->v_type == VDIR)) cache_purge(tvp); } /* * genfs_rename_enter: Look up fcnp in fdvp, and store the lookup * results in *fde_ret and the associated vnode in *fvp_ret; fail if * not found. Look up tcnp in tdvp, and store the lookup results in * *tde_ret and the associated vnode in *tvp_ret; store null instead if * not found. Fail if anything has been mounted on any of the nodes * involved. * * fdvp and tdvp must be referenced. * * On entry, nothing is locked. * * On success, everything is locked, and *fvp_ret, and *tvp_ret if * nonnull, are referenced. The only pairs of vnodes that may be * identical are {fdvp, tdvp} and {fvp, tvp}. * * On failure, everything remains as was. * * Locking everything including the source and target nodes is * necessary to make sure that, e.g., link count updates are OK. The * locking order is, in general, ancestor-first, matching the order you * need to use to look up a descendant anyway. 
*/ static int genfs_rename_enter(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct vnode *tdvp, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { int error; KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(fvp_ret != tvp_ret); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); if (fdvp == tdvp) error = genfs_rename_enter_common(ops, mp, cred, fdvp, fcnp, fde_ret, fvp_ret, tcnp, tde_ret, tvp_ret); else error = genfs_rename_enter_separate(ops, mp, cred, fdvp, fcnp, fde_ret, fvp_ret, tdvp, tcnp, tde_ret, tvp_ret); if (error) return error; KASSERT(*fvp_ret != NULL); KASSERT(VOP_ISLOCKED(*fvp_ret) == LK_EXCLUSIVE); KASSERT((*tvp_ret == NULL) || (VOP_ISLOCKED(*tvp_ret) == LK_EXCLUSIVE)); KASSERT(*fvp_ret != fdvp); KASSERT(*fvp_ret != tdvp); KASSERT(*tvp_ret != fdvp); KASSERT(*tvp_ret != tdvp); return 0; } /* * genfs_rename_enter_common: Lock and look up with a common * source/target directory. */ static int genfs_rename_enter_common(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { struct vnode *fvp, *tvp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(dvp->v_type == VDIR); KASSERT(dvp->v_mount == mp); error = ops->gro_lock_directory(mp, dvp); if (error) goto fail0; /* Did we lose a race with mount? */ if (dvp->v_mountedhere != NULL) { error = EBUSY; goto fail1; } KASSERT(fcnp->cn_nameiop == DELETE); error = ops->gro_lookup(mp, dvp, fcnp, fde_ret, &fvp); if (error) goto fail1; KASSERT(fvp != NULL); /* Refuse to rename `.'. */ if (fvp == dvp) { error = EINVAL; goto fail2; } KASSERT(fvp != dvp); KASSERT(tcnp->cn_nameiop == RENAME); error = ops->gro_lookup(mp, dvp, tcnp, tde_ret, &tvp); if (error == ENOENT) { tvp = NULL; } else if (error) { goto fail2; } else { KASSERT(tvp != NULL); /* Refuse to rename over `.'. */ if (tvp == dvp) { error = EISDIR; /* XXX EINVAL? */ goto fail2; } } KASSERT(tvp != dvp); /* * We've looked up both nodes. Now lock them and check them. */ vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); KASSERT(fvp->v_mount == mp); /* Refuse to rename a mount point. */ if ((fvp->v_type == VDIR) && (fvp->v_mountedhere != NULL)) { error = EBUSY; goto fail3; } if ((tvp != NULL) && (tvp != fvp)) { vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); KASSERT(tvp->v_mount == mp); /* Refuse to rename over a mount point. */ if ((tvp->v_type == VDIR) && (tvp->v_mountedhere != NULL)) { error = EBUSY; goto fail4; } } KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); *fvp_ret = fvp; *tvp_ret = tvp; return 0; fail4: if ((tvp != NULL) && (tvp != fvp)) VOP_UNLOCK(tvp); fail3: VOP_UNLOCK(fvp); if (tvp != NULL) vrele(tvp); fail2: vrele(fvp); fail1: VOP_UNLOCK(dvp); fail0: return error; } /* * genfs_rename_enter_separate: Lock and look up with separate source * and target directories. 
*/ static int genfs_rename_enter_separate(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *fdvp, struct componentname *fcnp, void *fde_ret, struct vnode **fvp_ret, struct vnode *tdvp, struct componentname *tcnp, void *tde_ret, struct vnode **tvp_ret) { struct vnode *intermediate_node; struct vnode *fvp, *tvp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fcnp != NULL); KASSERT(fvp_ret != NULL); KASSERT(tdvp != NULL); KASSERT(tcnp != NULL); KASSERT(tvp_ret != NULL); KASSERT(fdvp != tdvp); KASSERT(fcnp != tcnp); KASSERT(fcnp->cn_nameiop == DELETE); KASSERT(tcnp->cn_nameiop == RENAME); KASSERT(fvp_ret != tvp_ret); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == mp); KASSERT(tdvp->v_mount == mp); error = ops->gro_genealogy(mp, cred, fdvp, tdvp, &intermediate_node); if (error) return error; /* * intermediate_node == NULL means fdvp is not an ancestor of tdvp. */ if (intermediate_node == NULL) error = genfs_rename_lock(ops, mp, cred, ENOTEMPTY, EISDIR, EINVAL, tdvp, tcnp, true, tde_ret, &tvp, fdvp, fcnp, false, fde_ret, &fvp); else error = genfs_rename_lock(ops, mp, cred, EINVAL, EISDIR, EINVAL, fdvp, fcnp, false, fde_ret, &fvp, tdvp, tcnp, true, tde_ret, &tvp); if (error) goto out; KASSERT(fvp != NULL); /* * Reject rename("foo/bar", "foo/bar/baz/quux/zot"). */ if (fvp == intermediate_node) { genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp); error = EINVAL; goto out; } *fvp_ret = fvp; *tvp_ret = tvp; error = 0; out: if (intermediate_node != NULL) vrele(intermediate_node); return error; } /* * genfs_rename_lock: Lookup and lock it all. The lock order is: * * a_dvp -> a_vp -> b_dvp -> b_vp, * * except if a_vp is a nondirectory in which case the lock order is: * * a_dvp -> b_dvp -> b_vp -> a_vp, * * which can't violate ancestor->descendant because a_vp has no * descendants in this case. This edge case is necessary because some * file systems can only lookup/lock/unlock, and we can't hold a_vp * locked when we lookup/lock/unlock b_vp if they turn out to be the * same, and we can't find out that they're the same until after the * lookup. * * b_dvp must not be an ancestor of a_dvp, although a_dvp may be an * ancestor of b_dvp. * * Fail with overlap_error if node a is directory b. Neither * componentname may be `.' or `..'. * * a_dvp and b_dvp must be referenced. * * On entry, a_dvp and b_dvp are unlocked. * * On success, * . a_dvp and b_dvp are locked, * . *a_dirent_ret is filled with a directory entry whose node is * locked and referenced, * . *b_vp_ret is filled with the corresponding vnode, * . *b_dirent_ret is filled either with null or with a directory entry * whose node is locked and referenced, * . *b_vp is filled either with null or with the corresponding vnode, * and * . the only pair of vnodes that may be identical is a_vp and b_vp. * * On failure, a_dvp and b_dvp are left unlocked, and *a_dirent_ret, * *a_vp, *b_dirent_ret, and *b_vp are left alone. 
*/ static int genfs_rename_lock(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, int overlap_error, int a_dot_error, int b_dot_error, struct vnode *a_dvp, struct componentname *a_cnp, bool a_missing_ok, void *a_de_ret, struct vnode **a_vp_ret, struct vnode *b_dvp, struct componentname *b_cnp, bool b_missing_ok, void *b_de_ret, struct vnode **b_vp_ret) { struct vnode *a_vp, *b_vp; int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(a_dvp != NULL); KASSERT(a_cnp != NULL); KASSERT(a_vp_ret != NULL); KASSERT(b_dvp != NULL); KASSERT(b_cnp != NULL); KASSERT(b_vp_ret != NULL); KASSERT(a_dvp != b_dvp); KASSERT(a_vp_ret != b_vp_ret); KASSERT(a_dvp->v_type == VDIR); KASSERT(b_dvp->v_type == VDIR); KASSERT(a_dvp->v_mount == mp); KASSERT(b_dvp->v_mount == mp); KASSERT(a_missing_ok != b_missing_ok); /* * 1. Lock a_dvp. */ error = ops->gro_lock_directory(mp, a_dvp); if (error) goto fail0; /* Did we lose a race with mount? */ if (a_dvp->v_mountedhere != NULL) { error = EBUSY; goto fail1; } /* * 2. Lookup a_vp. May lock/unlock a_vp. */ error = ops->gro_lookup(mp, a_dvp, a_cnp, a_de_ret, &a_vp); if (error) { if (a_missing_ok && (error == ENOENT)) a_vp = NULL; else goto fail1; } else { KASSERT(a_vp != NULL); /* Refuse to rename (over) `.'. */ if (a_vp == a_dvp) { error = a_dot_error; goto fail2; } /* Reject rename("x", "x/y") or rename("x/y", "x"). */ if (a_vp == b_dvp) { error = overlap_error; goto fail2; } } KASSERT(a_vp != a_dvp); KASSERT(a_vp != b_dvp); /* * 3. Lock a_vp, if it is a directory. * * We already ruled out a_vp == a_dvp (i.e., a_cnp is `.'), so * this is not locking against self, and we already ruled out * a_vp == b_dvp, so this won't cause subsequent locking of * b_dvp to lock against self. * * If a_vp is a nondirectory, we can't hold it when we lookup * b_vp in case (a) the file system can only lookup/lock/unlock * and (b) b_vp turns out to be the same file as a_vp due to * hard links -- and we can't even detect that case until after * we've looked up b_vp. Fortunately, if a_vp is a * nondirectory, then it is a leaf, so we can safely lock it * last. */ if (a_vp != NULL && a_vp->v_type == VDIR) { vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(a_vp->v_mount == mp); /* Refuse to rename (over) a mount point. */ if (a_vp->v_mountedhere != NULL) { error = EBUSY; goto fail3; } } /* * 4. Lock b_dvp. */ error = ops->gro_lock_directory(mp, b_dvp); if (error) goto fail3; /* Did we lose a race with mount? */ if (b_dvp->v_mountedhere != NULL) { error = EBUSY; goto fail4; } /* * 5. Lookup b_vp. May lock/unlock b_vp. */ error = ops->gro_lookup(mp, b_dvp, b_cnp, b_de_ret, &b_vp); if (error) { if (b_missing_ok && (error == ENOENT)) b_vp = NULL; else goto fail4; } else { KASSERT(b_vp != NULL); /* Refuse to rename (over) `.'. */ if (b_vp == b_dvp) { error = b_dot_error; goto fail5; } /* * b_dvp must not be an ancestor of a_dvp, so if we * find b_dvp/b_vp=a_dvp/a_vp something is wrong. */ if (b_vp == a_dvp) { /* * We have a directory hard link before us. * XXX What error should this return? EDEADLK? * Panic? */ error = EIO; goto fail5; } } KASSERT(b_vp != b_dvp); KASSERT(b_vp != a_dvp); /* * 6. Lock a_vp, if it is a nondirectory. * * In this case a_vp is a leaf, so it is either equal to or * incommensurate with b_vp, and so we can safely lock it at * any point now. */ if (a_vp != NULL && a_vp->v_type != VDIR) { vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(a_vp->v_mount == mp); /* (not a directory so can't have anything mounted here) */ } /* * 7. 
Lock b_vp, if it is not a_vp. * * b_vp and a_vp may the same inode if they are hard links to * one another. */ if ((b_vp != NULL) && (b_vp != a_vp)) { vn_lock(b_vp, LK_EXCLUSIVE | LK_RETRY); KASSERT(b_vp->v_mount == mp); /* Refuse to rename (over) a mount point. */ if ((b_vp->v_type == VDIR) && (b_vp->v_mountedhere != NULL)) { error = EBUSY; goto fail6; } } KASSERT(VOP_ISLOCKED(a_dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(b_dvp) == LK_EXCLUSIVE); KASSERT(a_missing_ok || (a_vp != NULL)); KASSERT(b_missing_ok || (b_vp != NULL)); KASSERT((a_vp == NULL) || (VOP_ISLOCKED(a_vp) == LK_EXCLUSIVE)); KASSERT((b_vp == NULL) || (VOP_ISLOCKED(b_vp) == LK_EXCLUSIVE)); *a_vp_ret = a_vp; *b_vp_ret = b_vp; return 0; fail6: if ((b_vp != NULL) && (b_vp != a_vp)) VOP_UNLOCK(b_vp); if (a_vp != NULL && a_vp->v_type != VDIR) VOP_UNLOCK(a_vp); fail5: if (b_vp != NULL) vrele(b_vp); fail4: VOP_UNLOCK(b_dvp); fail3: if (a_vp != NULL && a_vp->v_type == VDIR) VOP_UNLOCK(a_vp); fail2: if (a_vp != NULL) vrele(a_vp); fail1: VOP_UNLOCK(a_dvp); fail0: return error; } /* * genfs_rename_exit: Unlock everything we locked for rename. * * fdvp and tdvp must be referenced. * * On entry, everything is locked, and fvp and tvp referenced. * * On exit, everything is unlocked, and fvp and tvp are released. */ static void genfs_rename_exit(const struct genfs_rename_ops *ops, struct mount *mp, struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp) { (void)ops; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != tvp); KASSERT(tdvp != fvp); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); if ((tvp != NULL) && (tvp != fvp)) VOP_UNLOCK(tvp); VOP_UNLOCK(fvp); if (tvp != NULL) vrele(tvp); if (tdvp != fdvp) VOP_UNLOCK(tdvp); vrele(fvp); VOP_UNLOCK(fdvp); } /* * genfs_rename_remove: Remove the entry for the non-directory vp with * componentname cnp from the directory dvp, using the lookup results * de. It is the responsibility of gro_remove to purge the name cache. * * Everything must be locked and referenced. */ static int genfs_rename_remove(const struct genfs_rename_ops *ops, struct mount *mp, kauth_cred_t cred, struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp, nlink_t *tvp_nlinkp) { int error; KASSERT(ops != NULL); KASSERT(mp != NULL); KASSERT(dvp != NULL); KASSERT(cnp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == mp); KASSERT(vp->v_mount == mp); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); error = ops->gro_remove_check_possible(mp, dvp, vp); if (error) return error; error = ops->gro_remove_check_permitted(mp, cred, dvp, vp); error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp, error); if (error) return error; error = ops->gro_remove(mp, cred, dvp, cnp, de, vp, tvp_nlinkp); if (error) return error; return 0; } static int genfs_ufslike_check_sticky(kauth_cred_t, mode_t, uid_t, struct vnode *, uid_t); /* * genfs_ufslike_rename_check_possible: Check whether a rename is * possible independent of credentials, assuming UFS-like inode flag * semantics. clobber_p is true iff the target node already exists. 
*/ int genfs_ufslike_rename_check_possible( unsigned long fdflags, unsigned long fflags, unsigned long tdflags, unsigned long tflags, bool clobber_p, unsigned long immutable, unsigned long append) { if ((fdflags | fflags) & (immutable | append)) return EPERM; if (tdflags & (immutable | (clobber_p? append : 0))) return EPERM; if (clobber_p && (tflags & (immutable | append))) return EPERM; return 0; } /* * genfs_ufslike_rename_check_permitted: Check whether a rename is * permitted given our credentials, assuming UFS-like permission and * ownership semantics. * * The only pair of vnodes that may be identical is {fdvp, tdvp}. * * Everything must be locked and referenced. */ int genfs_ufslike_rename_check_permitted(kauth_cred_t cred, struct vnode *fdvp, mode_t fdmode, uid_t fduid, struct vnode *fvp, uid_t fuid, struct vnode *tdvp, mode_t tdmode, uid_t tduid, struct vnode *tvp, uid_t tuid) { int error; KASSERT(fdvp != NULL); KASSERT(fvp != NULL); KASSERT(tdvp != NULL); KASSERT(fdvp != fvp); KASSERT(fdvp != tvp); KASSERT(tdvp != fvp); KASSERT(tdvp != tvp); KASSERT(fvp != tvp); KASSERT(fdvp->v_type == VDIR); KASSERT(tdvp->v_type == VDIR); KASSERT(fdvp->v_mount == fvp->v_mount); KASSERT(fdvp->v_mount == tdvp->v_mount); KASSERT((tvp == NULL) || (fdvp->v_mount == tvp->v_mount)); KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); /* * We need to remove or change an entry in the source directory. */ error = VOP_ACCESS(fdvp, VWRITE, cred); if (error) return error; /* * If we are changing directories, then we need to write to the * target directory to add or change an entry. Also, if fvp is * a directory, we need to write to it to change its `..' * entry. */ if (fdvp != tdvp) { error = VOP_ACCESS(tdvp, VWRITE, cred); if (error) return error; if (fvp->v_type == VDIR) { error = VOP_ACCESS(fvp, VWRITE, cred); if (error) return error; } } error = genfs_ufslike_check_sticky(cred, fdmode, fduid, fvp, fuid); if (error) return error; error = genfs_ufslike_check_sticky(cred, tdmode, tduid, tvp, tuid); if (error) return error; return 0; } /* * genfs_ufslike_remove_check_possible: Check whether a remove is * possible independent of credentials, assuming UFS-like inode flag * semantics. */ int genfs_ufslike_remove_check_possible(unsigned long dflags, unsigned long flags, unsigned long immutable, unsigned long append) { /* * We want to delete the entry. If the directory is immutable, * we can't write to it to delete the entry. If the directory * is append-only, the only change we can make is to add * entries, so we can't delete entries. If the node is * immutable, we can't change the links to it, so we can't * delete the entry. If the node is append-only...well, this * is what UFS does. */ if ((dflags | flags) & (immutable | append)) return EPERM; return 0; } /* * genfs_ufslike_remove_check_permitted: Check whether a remove is * permitted given our credentials, assuming UFS-like permission and * ownership semantics. * * Everything must be locked and referenced. 
*/ int genfs_ufslike_remove_check_permitted(kauth_cred_t cred, struct vnode *dvp, mode_t dmode, uid_t duid, struct vnode *vp, uid_t uid) { int error; KASSERT(dvp != NULL); KASSERT(vp != NULL); KASSERT(dvp != vp); KASSERT(dvp->v_type == VDIR); KASSERT(vp->v_type != VDIR); KASSERT(dvp->v_mount == vp->v_mount); KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); /* * We need to write to the directory to remove from it. */ error = VOP_ACCESS(dvp, VWRITE, cred); if (error) return error; error = genfs_ufslike_check_sticky(cred, dmode, duid, vp, uid); if (error) return error; return 0; } /* * genfs_ufslike_check_sticky: Check whether a party with credentials * cred may change an entry in a sticky directory, assuming UFS-like * permission, ownership, and stickiness semantics: If the directory is * sticky and the entry exists, the user must own either the directory * or the entry's node in order to change the entry. * * Everything must be locked and referenced. */ int genfs_ufslike_check_sticky(kauth_cred_t cred, mode_t dmode, uid_t duid, struct vnode *vp, uid_t uid) { if ((dmode & S_ISTXT) && (vp != NULL)) return genfs_can_sticky(vp, cred, duid, uid); return 0; }
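
/*
 * Minimal sketch, kept under #if 0: the ops table that the sample
 * copypasta near the top of this file leaves undefined.  The member
 * names are the hooks genfs_sane_rename() and its helpers call above;
 * the mumblefs_gro_* functions are hypothetical placeholders that a
 * real file system would implement.
 */
#if 0
static const struct genfs_rename_ops mumblefs_genfs_rename_ops = {
	.gro_directory_empty_p		= mumblefs_gro_directory_empty_p,
	.gro_rename_check_possible	= mumblefs_gro_rename_check_possible,
	.gro_rename_check_permitted	= mumblefs_gro_rename_check_permitted,
	.gro_remove_check_possible	= mumblefs_gro_remove_check_possible,
	.gro_remove_check_permitted	= mumblefs_gro_remove_check_permitted,
	.gro_rename			= mumblefs_gro_rename,
	.gro_remove			= mumblefs_gro_remove,
	.gro_lookup			= mumblefs_gro_lookup,
	.gro_genealogy			= mumblefs_gro_genealogy,
	.gro_lock_directory		= mumblefs_gro_lock_directory,
};
#endif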
/*	$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $	*/

/*
 * Copyright (c) 1999 National Aeronautics & Space Administration
 * All rights reserved.
 *
 * This software was written by William Studenmund of the
 * Numerical Aerospace Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the National Aeronautics & Space Administration
 *    nor the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
 * UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software donated to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/kmem.h> #include <miscfs/genfs/layer.h> #include <miscfs/genfs/layer_extern.h> #ifdef LAYERFS_DIAGNOSTIC int layerfs_debug = 1; #endif /* * layer cache: * Each cache entry holds a reference to the lower vnode * along with a pointer to the alias vnode. When an * entry is added the lower vnode is VREF'd. When the * alias is removed the lower vnode is vrele'd. */ void layerfs_init(void) { /* Nothing. */ } void layerfs_done(void) { /* Nothing. */ } /* * layer_node_create: try to find an existing layerfs vnode referring to it, * otherwise make a new vnode which contains a reference to the lower vnode. */ int layer_node_create(struct mount *mp, struct vnode *lowervp, struct vnode **nvpp) { int error; struct vnode *aliasvp; error = vcache_get(mp, &lowervp, sizeof(lowervp), &aliasvp); if (error) return error; /* * Now that we acquired a reference on the upper vnode, release one * on the lower node. The existence of the layer_node retains one * reference to the lower node. */ vrele(lowervp); KASSERT(vrefcnt(lowervp) > 0); #ifdef LAYERFS_DIAGNOSTIC if (layerfs_debug) vprint("layer_node_create: alias", aliasvp); #endif *nvpp = aliasvp; return 0; } #ifdef LAYERFS_DIAGNOSTIC struct vnode * layer_checkvp(struct vnode *vp, const char *fil, int lno) { struct layer_node *a = VTOLAYER(vp); #ifdef notyet /* * Can't do this check because vop_reclaim runs * with a funny vop vector. * * WRS - no it doesnt... */ if (vp->v_op != layer_vnodeop_p) { printf ("layer_checkvp: on non-layer-node\n"); #ifdef notyet while (layer_checkvp_barrier) /*WAIT*/ ; #endif panic("layer_checkvp"); }; #endif if (a->layer_lowervp == NULL) { /* Should never happen */ int i; u_long *p; printf("vp = %p, ZERO ptr\n", vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ panic("layer_checkvp"); } if (vrefcnt(a->layer_lowervp) < 1) { int i; u_long *p; printf("vp = %p, unref'ed lowervp\n", vp); for (p = (u_long *) a, i = 0; i < 8; i++) printf(" %lx", p[i]); printf("\n"); /* wait for debugger */ panic ("layer with unref'ed lowervp"); }; #ifdef notnow printf("layer %p/%d -> %p/%d [%s, %d]\n", LAYERTOV(a), vrefcnt(LAYERTOV(a)), a->layer_lowervp, vrefcnt(a->layer_lowervp), fil, lno); #endif return a->layer_lowervp; } #endif
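
/*
 * Minimal usage sketch, kept under #if 0: the reference discipline at a
 * layer_node_create() call site, as described by the comments above.
 * The example_* names are hypothetical; the real call sites live
 * elsewhere in layerfs.
 */
#if 0
static int
example_make_alias(struct mount *mp, struct vnode *lowervp,
    struct vnode **vpp)
{
	int error;

	/* In this sketch the caller holds a reference on lowervp. */
	error = layer_node_create(mp, lowervp, vpp);
	if (error) {
		/* Failure: the lower reference is still ours to drop. */
		vrele(lowervp);
		return error;
	}
	/*
	 * Success: layer_node_create() released our reference on
	 * lowervp; the alias in *vpp retains its own reference to it.
	 */
	return 0;
}
#endif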
171 41 245 248 6 246 248 5 88 5 8 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 /* $NetBSD: lwp.h,v 1.231 2023/11/02 10:31:55 martin Exp $ */ /* * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020, 2023 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Nathan J. Williams and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_LWP_H_ #define _SYS_LWP_H_ #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/param.h> #include <sys/callout.h> #include <sys/condvar.h> #include <sys/kcpuset.h> #include <sys/mutex.h> #include <sys/queue.h> #include <sys/resource.h> #include <sys/sched.h> #include <sys/signalvar.h> #include <sys/specificdata.h> #include <sys/time.h> #include <sys/wchan.h> #if defined(_KERNEL) struct lwp; /* forward declare this for <machine/cpu.h> so it can get l_cpu. */ static __inline struct cpu_info *lwp_getcpu(struct lwp *); #include <machine/cpu.h> /* curcpu() and cpu_info */ #include <sys/atomic.h> #ifdef _KERNEL_OPT #include "opt_kcov.h" #include "opt_kmsan.h" #include "opt_maxlwp.h" #endif #endif #include <machine/proc.h> /* Machine-dependent proc substruct. */ /* * Lightweight process. Field markings and the corresponding locks: * * a: proc_lock * c: condition variable interlock, passed to cv_wait() * l: *l_mutex * p: l_proc->p_lock * s: spc_mutex, which may or may not be referenced by l_mutex * S: l_selcluster->sc_lock * (: unlocked, stable * !: unlocked, may only be reliably accessed by the LWP itself * * Fields are clustered together by usage (to increase the likelihood * of cache hits) and by size (to reduce dead space in the structure). */ #include <sys/pcu.h> struct lockdebug; struct sysent; struct lwp { /* Must not be zeroed on free. */ struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */ kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */ struct turnstile *l_ts; /* l: current turnstile */ int l_stat; /* l: overall LWP status */ int l__reserved; /* : padding - reuse as needed */ /* Scheduling and overall state. */ #define l_startzero l_runq TAILQ_ENTRY(lwp) l_runq; /* s: run queue */ union { void * info; /* s: scheduler-specific structure */ u_int timeslice; /* l: time-quantum for SCHED_M2 */ } l_sched; void *l_addr; /* l: PCB address; use lwp_getpcb() */ struct mdlwp l_md; /* l: machine-dependent fields. 
*/ struct bintime l_rtime; /* l: real time */ struct bintime l_stime; /* l: start time (while ONPROC) */ int l_flag; /* l: misc flag values */ u_int l_swtime; /* l: time swapped in or out */ u_int l_rticks; /* l: Saved start time of run */ u_int l_rticksum; /* l: Sum of ticks spent running */ u_int l_slpticks; /* l: Saved start time of sleep */ u_int l_slpticksum; /* l: Sum of ticks spent sleeping */ int l_class; /* l: scheduling class */ pri_t l_boostpri; /* l: boosted priority after blocking */ pri_t l_priority; /* l: scheduler priority */ pri_t l_inheritedprio;/* l: inherited priority */ pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */ pri_t l_auxprio; /* l: max(inherit,protect) priority */ int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */ u_int l_cpticks; /* (: Ticks of CPU time */ psetid_t l_psid; /* l: assigned processor-set ID */ fixpt_t l_pctcpu; /* p: %cpu during l_swtime */ fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */ SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */ struct cpu_info *l_target_cpu; /* l: target CPU to migrate */ struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */ struct lcpage *l_lcpage; /* p: lwpctl containing page */ kcpuset_t *l_affinity; /* l: CPU set for affinity */ /* Synchronisation. */ const struct syncobj *l_syncobj;/* l: sync object operations set */ LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */ wchan_t l_wchan; /* l: sleep address */ const char *l_wmesg; /* l: reason for sleep */ struct sleepq *l_sleepq; /* l: current sleep queue */ callout_t l_timeout_ch; /* !: callout for tsleep */ kcondvar_t l_waitcv; /* a: vfork() wait */ u_int l_slptime; /* l: time since last blocked */ bool l_vforkwaiting; /* a: vfork() waiting */ /* User-space synchronization. */ uintptr_t l_robust_head; /* !: list of robust futexes */ uint32_t l___rsvd1; /* reserved for future use */ #if PCU_UNIT_COUNT > 0 struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT]; uint32_t l_pcu_valid; #endif /* Process level and global state, misc. */ lwpid_t l_lid; /* (: LWP identifier; local to proc */ LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */ void *l_ctxlink; /* p: uc_link {get,set}context */ struct proc *l_proc; /* p: parent process */ LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */ char *l_name; /* (: name, optional */ lwpid_t l_waiter; /* p: first LWP waiting on us */ lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */ int l_prflag; /* p: process level flags */ u_int l_refcnt; /* p: reference count on this LWP */ /* State of select() or poll(). */ int l_selflag; /* S: polling state flags */ int l_selret; /* S: return value of select/poll */ SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */ uintptr_t l_selrec; /* !: argument for selrecord() */ struct selcluster *l_selcluster;/* !: associated cluster data */ void * l_selbits; /* (: select() bit-field */ size_t l_selni; /* (: size of a single bit-field */ /* Signals. */ int l_sigrestore; /* p: need to restore old sig mask */ sigset_t l_sigwaitset; /* p: signals being waited for */ kcondvar_t l_sigcv; /* p: for sigsuspend() */ struct ksiginfo *l_sigwaited; /* p: delivered signals from set */ sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */ LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */ stack_t l_sigstk; /* p: sp & on stack state variable */ sigset_t l_sigmask; /* p: signal mask */ sigpend_t l_sigpend; /* p: signals to this LWP */ sigset_t l_sigoldmask; /* p: mask for sigpause */ /* Private data. 
*/ specificdata_reference l_specdataref; /* !: subsystem lwp-specific data */ struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */ void *l_private; /* !: svr4-style lwp-private data */ struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */ struct kauth_cred *l_cred; /* !: cached credentials */ struct filedesc *l_fd; /* !: cached copy of proc::p_fd */ void *l_emuldata; /* !: kernel lwp-private data */ struct fstrans_lwp_info *l_fstrans; /* (: fstrans private data */ u_short l_shlocks; /* !: lockdebug: shared locks held */ u_short l_exlocks; /* !: lockdebug: excl. locks held */ u_short l_psrefs; /* !: count of psref held */ u_short l_blcnt; /* !: count of kernel_lock held */ volatile int l_nopreempt; /* !: don't preempt me! */ volatile u_int l_dopreempt; /* s: kernel preemption pending */ int l_pflag; /* !: LWP private flags */ int l_dupfd; /* !: side return from cloning devs XXX */ const struct sysent * volatile l_sysent;/* !: currently active syscall */ struct rusage l_ru; /* !: accounting information */ uint64_t l_pfailtime; /* !: for kernel preemption */ uintptr_t l_pfailaddr; /* !: for kernel preemption */ uintptr_t l_pfaillock; /* !: for kernel preemption */ _TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */ volatile void *l_ld_wanted; /* !: lock currently wanted by LWP */ uintptr_t l_rwcallsite; /* !: rwlock actual callsite */ int l_tcgen; /* !: for timecounter removal */ /* These are only used by 'options SYSCALL_TIMES'. */ uint32_t l_syscall_time; /* !: time epoch for current syscall */ uint64_t *l_syscall_counter; /* !: counter for current process */ struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */ #ifdef KMSAN void *l_kmsan; /* !: KMSAN private data. */ #endif #ifdef KCOV void *l_kcov; /* !: KCOV private data. */ #endif }; /* * UAREA_PCB_OFFSET: an offset of PCB structure in the uarea. MD code may * define it in <machine/proc.h>, to indicate a different uarea layout. */ #ifndef UAREA_PCB_OFFSET #define UAREA_PCB_OFFSET 0 #endif LIST_HEAD(lwplist, lwp); /* A list of LWPs. */ #ifdef _KERNEL extern struct lwplist alllwp; /* List of all LWPs. */ extern lwp_t lwp0; /* LWP for proc0. */ extern int maxlwp __read_mostly; /* max number of lwps */ #ifndef MAXLWP #define MAXLWP 4096 /* default max */ #endif #ifndef MAXMAXLWP #define MAXMAXLWP 65535 /* absolute max */ #endif #endif #endif /* _KERNEL || _KMEMUSER */ /* * These flags are kept in l_flag, and they are modified only with the LWP * locked. */ #define LW_IDLE 0x00000001 /* Idle lwp. */ #define LW_LWPCTL 0x00000002 /* Adjust lwpctl in userret */ #define LW_STIMO 0x00000040 /* Sleep timed out */ #define LW_SINTR 0x00000080 /* Sleep is interruptible. */ #define LW_CATCHINTR 0x00000100 /* LW_SINTR intent; see sleepq_block(). 
*/ #define LW_SYSTEM 0x00000200 /* Kernel thread */ #define LW_SYSTEM_FPU 0x00000400 /* Kernel thread with vector/FP enabled */ #define LW_DBGSUSPEND 0x00010000 /* Suspend by debugger */ #define LW_WSUSPEND 0x00020000 /* Suspend before return to user */ #define LW_BATCH 0x00040000 /* LWP tends to hog CPU */ #define LW_WCORE 0x00080000 /* Stop for core dump on return to user */ #define LW_WEXIT 0x00100000 /* Exit before return to user */ #define LW_PENDSIG 0x01000000 /* Pending signal for us */ #define LW_CANCELLED 0x02000000 /* tsleep should not sleep */ #define LW_CACHECRED 0x04000000 /* Cache new process credential */ #define LW_WREBOOT 0x08000000 /* System is rebooting, please suspend */ #define LW_UNPARKED 0x10000000 /* Unpark op pending */ #define LW_RUMP_CLEAR 0x40000000 /* Clear curlwp in RUMP scheduler */ #define LW_RUMP_QEXIT 0x80000000 /* LWP should exit ASAP */ /* * The second set of flags is kept in l_pflag, and they are modified only by * the LWP itself, or modified when it's known the LWP cannot be running. * LP_RUNNING is typically updated with the LWP locked, but not always in * the case of soft interrupt handlers. */ #define LP_KTRACTIVE 0x00000001 /* Executing ktrace operation */ #define LP_KTRCSW 0x00000002 /* ktrace context switch marker */ #define LP_KTRCSWUSER 0x00000004 /* ktrace context switch marker */ /* 0x00000008 was LP_PIDLID */ #define LP_OWEUPC 0x00000010 /* Owe user profiling tick */ #define LP_MPSAFE 0x00000020 /* Starts life without kernel_lock */ #define LP_INTR 0x00000040 /* Soft interrupt handler */ #define LP_SYSCTLWRITE 0x00000080 /* sysctl write lock held */ #define LP_MUSTJOIN 0x00000100 /* Must join kthread on exit */ #define LP_SINGLESTEP 0x00000400 /* Single step thread in ptrace(2) */ #define LP_TIMEINTR 0x00010000 /* Time this soft interrupt */ #define LP_PREEMPTING 0x00020000 /* mi_switch called involuntarily */ #define LP_RUNNING 0x20000000 /* Active on a CPU */ #define LP_TELEPORT 0x40000000 /* Teleport to new CPU on preempt() */ #define LP_BOUND 0x80000000 /* Bound to a CPU */ /* * The third set of flags is kept in l_prflag and they are modified only * with p_lock held. */ #define LPR_DETACHED 0x00800000 /* Won't be waited for. */ #define LPR_DRAINING 0x80000000 /* Draining references before exiting */ /* * Mask indicating that there is "exceptional" work to be done on return to * user. */ #define LW_USERRET (LW_WEXIT | LW_PENDSIG | LW_WREBOOT | LW_WSUSPEND \ | LW_WCORE | LW_LWPCTL | LW_CACHECRED) /* * Status values. * * A note about LSRUN and LSONPROC: LSRUN indicates that a process is * runnable but *not* yet running, i.e. is on a run queue. LSONPROC * indicates that the process is actually executing on a CPU, i.e. * it is no longer on a run queue. * * These values are set in stone and must not be reused with future changes. */ #define LSIDL 1 /* Process being created by fork. */ #define LSRUN 2 /* Currently runnable. */ #define LSSLEEP 3 /* Sleeping on an address. */ #define LSSTOP 4 /* Process debugging or suspension. */ #define LSZOMB 5 /* Awaiting collection by parent. */ /* define LSDEAD 6 Process is almost a zombie. (removed in 5.0) */ #define LSONPROC 7 /* Process is currently on a CPU. */ #define LSSUSPENDED 8 /* Not running, not signalable. 
*/ #if defined(_KERNEL) || defined(_KMEMUSER) static __inline void * lwp_getpcb(struct lwp *l) { return l->l_addr; } #endif /* _KERNEL || _KMEMUSER */ #ifdef _KERNEL void lwpinit(void); void lwp0_init(void); void lwp_startup(lwp_t *, lwp_t *); void startlwp(void *); void lwp_lock(lwp_t *); void lwp_unlock(lwp_t *); pri_t lwp_eprio(lwp_t *); int lwp_locked(lwp_t *, kmutex_t *); kmutex_t *lwp_setlock(lwp_t *, kmutex_t *); void lwp_unlock_to(lwp_t *, kmutex_t *); int lwp_trylock(lwp_t *); void lwp_changepri(lwp_t *, pri_t); void lwp_lendpri(lwp_t *, pri_t); void lwp_addref(lwp_t *); void lwp_delref(lwp_t *); void lwp_delref2(lwp_t *); bool lwp_drainrefs(lwp_t *); bool lwp_alive(lwp_t *); lwp_t *lwp_find_first(proc_t *); int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool); void lwp_continue(lwp_t *); void lwp_unsleep(lwp_t *, bool); void lwp_unstop(lwp_t *); void lwp_exit(lwp_t *); int lwp_suspend(lwp_t *, lwp_t *); int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *); void lwp_start(lwp_t *, int); void lwp_migrate(lwp_t *, struct cpu_info *); lwp_t * lwp_find2(pid_t, lwpid_t); lwp_t * lwp_find(proc_t *, int); void lwp_userret(lwp_t *); void lwp_need_userret(lwp_t *); void lwp_free(lwp_t *, bool, bool); long lwp_pctr(void); int lwp_setprivate(lwp_t *, void *); int do_lwp_create(lwp_t *, void *, u_long, lwp_t **, const sigset_t *, const stack_t *); void lwp_thread_cleanup(lwp_t *); void lwpinit_specificdata(void); int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t); void lwp_specific_key_delete(specificdata_key_t); void lwp_initspecific(lwp_t *); void lwp_finispecific(lwp_t *); void *lwp_getspecific(specificdata_key_t); #if defined(_LWP_API_PRIVATE) void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t); #endif void lwp_setspecific(specificdata_key_t, void *); void lwp_setspecific_by_lwp(lwp_t *, specificdata_key_t, void *); /* Syscalls. */ int lwp_park(clockid_t, int, struct timespec *); int lwp_unpark(const lwpid_t *, const u_int); /* DDB. */ void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2)); int lwp_create(lwp_t *, struct proc *, vaddr_t, int, void *, size_t, void (*)(void *), void *, lwp_t **, int, const sigset_t *, const stack_t *); /* * XXX _MODULE * We should provide real stubs for the below that modules can use. */ static __inline void spc_lock(struct cpu_info *ci) { mutex_spin_enter(ci->ci_schedstate.spc_mutex); } static __inline void spc_unlock(struct cpu_info *ci) { mutex_spin_exit(ci->ci_schedstate.spc_mutex); } static __inline void spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2) { struct schedstate_percpu *spc1 = &ci1->ci_schedstate; struct schedstate_percpu *spc2 = &ci2->ci_schedstate; KASSERT(ci1 != ci2); if (ci1 < ci2) { mutex_spin_enter(spc1->spc_mutex); mutex_spin_enter(spc2->spc_mutex); } else { mutex_spin_enter(spc2->spc_mutex); mutex_spin_enter(spc1->spc_mutex); } } /* * Allow machine-dependent code to override curlwp in <machine/cpu.h> for * its own convenience. Otherwise, we declare it as appropriate. */ #if !defined(curlwp) #if defined(MULTIPROCESSOR) #define curlwp curcpu()->ci_curlwp /* Current running LWP */ #else extern struct lwp *curlwp; /* Current running LWP */ #endif /* MULTIPROCESSOR */ #endif /* ! curlwp */ #define curproc (curlwp->l_proc) /* * This provides a way for <machine/cpu.h> to get l_cpu for curlwp before * struct lwp is defined. 
*/ static __inline struct cpu_info * lwp_getcpu(struct lwp *l) { return l->l_cpu; } static __inline bool CURCPU_IDLE_P(void) { struct cpu_info *ci = curcpu(); return ci->ci_onproc == ci->ci_data.cpu_idlelwp; } /* * Disable and re-enable preemption. Only for low-level kernel * use. Device drivers and anything that could potentially be * compiled as a module should use kpreempt_disable() and * kpreempt_enable(). */ static __inline void KPREEMPT_DISABLE(lwp_t *l) { struct lwp *l1 __diagused; KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1); l->l_nopreempt++; __insn_barrier(); } static __inline void KPREEMPT_ENABLE(lwp_t *l) { struct lwp *l1 __diagused; KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1); KASSERT(l->l_nopreempt > 0); __insn_barrier(); l->l_nopreempt--; __insn_barrier(); if (__predict_false(l->l_dopreempt)) kpreempt(0); } /* For lwp::l_dopreempt */ #define DOPREEMPT_ACTIVE 0x01 #define DOPREEMPT_COUNTED 0x02 /* * Prevent curlwp from migrating between CPUs between curlwp_bind and * curlwp_bindx. One use case is psref(9) that has a contract that * forbids migrations. */ static __inline int curlwp_bind(void) { int bound; bound = curlwp->l_pflag & LP_BOUND; curlwp->l_pflag |= LP_BOUND; __insn_barrier(); return bound; } static __inline void curlwp_bindx(int bound) { KASSERT(curlwp->l_pflag & LP_BOUND); __insn_barrier(); curlwp->l_pflag ^= bound ^ LP_BOUND; } #endif /* _KERNEL */ /* Flags for _lwp_create(), as per Solaris. */ #define LWP_DETACHED 0x00000040 #define LWP_SUSPENDED 0x00000080 /* Kernel-internal flags for LWP creation. */ /* 0x40000000 was LWP_PIDLID */ #define LWP_VFORK 0x80000000 #endif /* !_SYS_LWP_H_ */
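/*
 * Illustrative sketch, not part of lwp.h: the intended pairing of
 * curlwp_bind() and curlwp_bindx() around a section that must not
 * migrate between CPUs, e.g. while holding a passive reference as
 * psref(9) requires.  "example_bound_section" is a made-up name; the
 * comment between the two calls stands in for whatever CPU-local work
 * the caller performs.
 */
static void
example_bound_section(void)
{
	int bound;

	bound = curlwp_bind();		/* set LP_BOUND, remember old state */
	/* ... work that must stay on this CPU, e.g. psref_acquire() ... */
	curlwp_bindx(bound);		/* restore the previous LP_BOUND state */
}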
44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 /* $NetBSD: sleeptab.h,v 1.3 2023/10/15 10:27:11 riastradh Exp $ */ /*- * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020 * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _SYS_SLEEPTAB_H_ #define _SYS_SLEEPTAB_H_ #include <sys/wchan.h> struct syncobj; #define SLEEPTAB_HASH_SHIFT 7 #define SLEEPTAB_HASH_SIZE (1 << SLEEPTAB_HASH_SHIFT) #define SLEEPTAB_HASH_MASK (SLEEPTAB_HASH_SIZE - 1) #define SLEEPTAB_HASH(wchan) (((uintptr_t)(wchan) >> 8) & SLEEPTAB_HASH_MASK) LIST_HEAD(sleepq, lwp); typedef struct sleeptab { sleepq_t st_queue[SLEEPTAB_HASH_SIZE]; } sleeptab_t; void sleeptab_init(sleeptab_t *); extern sleeptab_t sleeptab; #ifdef _KERNEL /* * Find the correct sleep queue for the specified wait channel. This * acquires and holds the per-queue interlock. */ static __inline sleepq_t * sleeptab_lookup(sleeptab_t *st, wchan_t wchan, kmutex_t **mp) { extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE]; sleepq_t *sq; u_int hash; hash = SLEEPTAB_HASH(wchan); sq = &st->st_queue[hash]; *mp = &sleepq_locks[hash].lock; mutex_spin_enter(*mp); return sq; } static __inline kmutex_t * sleepq_hashlock(wchan_t wchan) { extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE]; kmutex_t *mp; mp = &sleepq_locks[SLEEPTAB_HASH(wchan)].lock; mutex_spin_enter(mp); return mp; } #define sleepq_destroy(a) __nothing #endif /* * Turnstiles, specialized sleep queues for use by kernel locks. 
*/ typedef struct turnstile { LIST_ENTRY(turnstile) ts_chain; /* link on hash chain */ struct turnstile *ts_free; /* turnstile free list */ wchan_t ts_obj; /* lock object */ sleepq_t ts_sleepq[2]; /* sleep queues */ u_int ts_waiters[2]; /* count of waiters */ /* priority inheritance */ pri_t ts_eprio; lwp_t *ts_inheritor; SLIST_ENTRY(turnstile) ts_pichain; } turnstile_t; LIST_HEAD(tschain, turnstile); typedef struct tschain tschain_t; #define TS_READER_Q 0 /* reader sleep queue */ #define TS_WRITER_Q 1 /* writer sleep queue */ #define TS_WAITERS(ts, q) \ (ts)->ts_waiters[(q)] #define TS_ALL_WAITERS(ts) \ ((ts)->ts_waiters[TS_READER_Q] + \ (ts)->ts_waiters[TS_WRITER_Q]) #define TS_FIRST(ts, q) (LIST_FIRST(&(ts)->ts_sleepq[(q)])) #ifdef _KERNEL void turnstile_init(void); turnstile_t *turnstile_lookup(wchan_t); void turnstile_ctor(turnstile_t *); void turnstile_exit(wchan_t); void turnstile_block(turnstile_t *, int, wchan_t, const struct syncobj *); void turnstile_wakeup(turnstile_t *, int, int, lwp_t *); void turnstile_print(volatile void *, void (*)(const char *, ...) __printflike(1, 2)); void turnstile_unsleep(lwp_t *, bool); void turnstile_changepri(lwp_t *, pri_t); extern struct pool turnstile_pool; extern turnstile_t turnstile0; #endif /* _KERNEL */ #endif /* _SYS_SLEEPTAB_H_ */
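/*
 * Illustrative sketch, not part of sleeptab.h: mapping a wait channel
 * to its sleep queue.  sleeptab_lookup() hashes the channel address
 * (shifted right by 8 bits, then masked) and returns the queue with
 * the corresponding spin interlock held; whoever looks a queue up is
 * responsible for dropping that lock again.  "example_sleepq_peek" is
 * a made-up name; real callers go on to enqueue an LWP via the sleepq
 * code instead of just releasing the lock.
 */
static void
example_sleepq_peek(wchan_t wchan)
{
	sleepq_t *sq;
	kmutex_t *mp;

	sq = sleeptab_lookup(&sleeptab, wchan, &mp);
	/* ... the sleepq code would enqueue curlwp on sq here ... */
	(void)sq;
	mutex_spin_exit(mp);		/* release the per-queue interlock */
}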
23 23 23 23 23 23 22 23 23 23 23 22 23 22 23 23 22 22 18 23 9 22 23 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 /* $NetBSD: vioscsi.c,v 1.36 2023/03/25 11:04:34 mlelstv Exp $ */ /* $OpenBSD: vioscsi.c,v 1.3 2015/03/14 03:38:49 jsg Exp $ */ /* * Copyright (c) 2013 Google Inc. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vioscsi.c,v 1.36 2023/03/25 11:04:34 mlelstv Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/device.h> #include <sys/bus.h> #include <sys/buf.h> #include <sys/module.h> #include <dev/pci/vioscsireg.h> #include <dev/pci/virtiovar.h> #include <dev/scsipi/scsi_all.h> #include <dev/scsipi/scsiconf.h> #ifdef VIOSCSI_DEBUG static int vioscsi_debug = 1; #define DPRINTF(f) do { if (vioscsi_debug) printf f; } while (/*CONSTCOND*/0) #else #define DPRINTF(f) ((void)0) #endif struct vioscsi_req { struct virtio_scsi_req_hdr vr_req; struct virtio_scsi_res_hdr vr_res; struct scsipi_xfer *vr_xs; bus_dmamap_t vr_control; bus_dmamap_t vr_data; }; struct vioscsi_softc { device_t sc_dev; struct scsipi_adapter sc_adapter; struct scsipi_channel sc_channel; struct virtqueue sc_vqs[3]; #define VIOSCSI_VQ_CONTROL 0 #define VIOSCSI_VQ_EVENT 1 #define VIOSCSI_VQ_REQUEST 2 struct vioscsi_req *sc_reqs; int sc_nreqs; bus_dma_segment_t sc_reqs_segs[1]; u_int32_t sc_seg_max; kmutex_t sc_mutex; }; /* * Each block request uses at least two segments - one for the header * and one for the status. */ #define VIRTIO_SCSI_MIN_SEGMENTS 2 static int vioscsi_match(device_t, cfdata_t, void *); static void vioscsi_attach(device_t, device_t, void *); static int vioscsi_detach(device_t, int); static int vioscsi_alloc_reqs(struct vioscsi_softc *, struct virtio_softc *, int); static void vioscsi_free_reqs(struct vioscsi_softc *, struct virtio_softc *); static void vioscsi_scsipi_request(struct scsipi_channel *, scsipi_adapter_req_t, void *); static int vioscsi_vq_done(struct virtqueue *); static void vioscsi_req_done(struct vioscsi_softc *, struct virtio_softc *, struct vioscsi_req *, struct virtqueue *, int); static struct vioscsi_req *vioscsi_req_get(struct vioscsi_softc *); static void vioscsi_bad_target(struct scsipi_xfer *); static const char *const vioscsi_vq_names[] = { "control", "event", "request", }; CFATTACH_DECL3_NEW(vioscsi, sizeof(struct vioscsi_softc), vioscsi_match, vioscsi_attach, vioscsi_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN); static int vioscsi_match(device_t parent, cfdata_t match, void *aux) { struct virtio_attach_args *va = aux; if (va->sc_childdevid == VIRTIO_DEVICE_ID_SCSI) return 1; return 0; } static void vioscsi_attach(device_t parent, device_t self, void *aux) { struct vioscsi_softc *sc = device_private(self); struct virtio_softc *vsc = device_private(parent); struct scsipi_adapter *adapt = &sc->sc_adapter; struct scsipi_channel *chan = &sc->sc_channel; int rv, qsize = 0, i = 0; int ipl = IPL_BIO; if (virtio_child(vsc) != NULL) { aprint_error(": parent %s already has a child\n", device_xname(parent)); return; } sc->sc_dev = self; virtio_child_attach_start(vsc, self, ipl, 0, VIRTIO_COMMON_FLAG_BITS); mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, ipl); uint32_t cmd_per_lun = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_CMD_PER_LUN); uint32_t seg_max = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_SEG_MAX); uint16_t max_target = virtio_read_device_config_2(vsc, VIRTIO_SCSI_CONFIG_MAX_TARGET); uint32_t max_lun = virtio_read_device_config_4(vsc, VIRTIO_SCSI_CONFIG_MAX_LUN); sc->sc_seg_max = seg_max; 
for(i=0; i < __arraycount(sc->sc_vqs); i++) { virtio_init_vq_vqdone(vsc, &sc->sc_vqs[i], i, vioscsi_vq_done); rv = virtio_alloc_vq(vsc, &sc->sc_vqs[i], MAXPHYS, VIRTIO_SCSI_MIN_SEGMENTS + howmany(MAXPHYS, NBPG), vioscsi_vq_names[i]); if (rv) { aprint_error_dev(sc->sc_dev, "failed to allocate virtqueue %d\n", i); goto err; } if (i == VIOSCSI_VQ_REQUEST) sc->sc_vqs[i].vq_done = vioscsi_vq_done; } qsize = sc->sc_vqs[VIOSCSI_VQ_REQUEST].vq_num; if (vioscsi_alloc_reqs(sc, vsc, qsize)) goto err; aprint_normal_dev(sc->sc_dev, "cmd_per_lun %u qsize %d seg_max %u max_target %hu" " max_lun %u\n", cmd_per_lun, qsize, seg_max, max_target, max_lun); if (virtio_child_attach_finish(vsc, sc->sc_vqs, __arraycount(sc->sc_vqs), NULL, VIRTIO_F_INTR_MSIX | VIRTIO_F_INTR_MPSAFE) != 0) goto err; /* * Fill in the scsipi_adapter. */ memset(adapt, 0, sizeof(*adapt)); adapt->adapt_dev = sc->sc_dev; adapt->adapt_nchannels = 1; adapt->adapt_openings = MIN(qsize, cmd_per_lun); adapt->adapt_max_periph = adapt->adapt_openings; adapt->adapt_request = vioscsi_scsipi_request; adapt->adapt_minphys = minphys; adapt->adapt_flags = SCSIPI_ADAPT_MPSAFE; /* * Fill in the scsipi_channel. */ memset(chan, 0, sizeof(*chan)); chan->chan_adapter = adapt; chan->chan_bustype = &scsi_bustype; chan->chan_channel = 0; chan->chan_ntargets = MIN(1 + max_target, 256); /* cap reasonably */ chan->chan_nluns = MIN(1 + max_lun, 16384); /* cap reasonably */ chan->chan_id = max_target + 1; chan->chan_flags = SCSIPI_CHAN_NOSETTLE; config_found(self, &sc->sc_channel, scsiprint, CFARGS_NONE); return; err: if (qsize > 0) vioscsi_free_reqs(sc, vsc); for (i=0; i < __arraycount(sc->sc_vqs); i++) { virtio_free_vq(vsc, &sc->sc_vqs[i]); } virtio_child_attach_failed(vsc); } static int vioscsi_detach(device_t self, int flags) { struct vioscsi_softc *sc = device_private(self); struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); int rc, i; /* * Dequeue all pending finished requests. Must be done * before we try to detach children so that we process * their pending requests while they still exist. */ if (sc->sc_vqs[VIOSCSI_VQ_REQUEST].vq_num > 0) vioscsi_vq_done(&sc->sc_vqs[VIOSCSI_VQ_REQUEST]); if ((rc = config_detach_children(self, flags)) != 0) return rc; virtio_reset(vsc); for (i = 0; i < __arraycount(sc->sc_vqs); i++) { if (sc->sc_vqs[i].vq_num > 0) virtio_free_vq(vsc, &sc->sc_vqs[i]); } vioscsi_free_reqs(sc, vsc); virtio_child_detach(vsc); mutex_destroy(&sc->sc_mutex); return 0; } #define XS2DMA(xs) \ ((((xs)->xs_control & XS_CTL_DATA_IN) ? BUS_DMA_READ : BUS_DMA_WRITE) | \ (((xs)->xs_control & XS_CTL_NOSLEEP) ? BUS_DMA_NOWAIT : BUS_DMA_WAITOK) | \ BUS_DMA_STREAMING) #define XS2DMAPRE(xs) (((xs)->xs_control & XS_CTL_DATA_IN) ? \ BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE) #define XS2DMAPOST(xs) (((xs)->xs_control & XS_CTL_DATA_IN) ? 
\ BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE) static void vioscsi_scsipi_request(struct scsipi_channel *chan, scsipi_adapter_req_t request, void *arg) { struct vioscsi_softc *sc = device_private(chan->chan_adapter->adapt_dev); struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); struct scsipi_xfer *xs; struct scsipi_periph *periph; struct vioscsi_req *vr; struct virtio_scsi_req_hdr *req; struct virtqueue *vq = &sc->sc_vqs[VIOSCSI_VQ_REQUEST]; int slot, error; bool dopoll; DPRINTF(("%s: enter\n", __func__)); switch (request) { case ADAPTER_REQ_RUN_XFER: break; case ADAPTER_REQ_SET_XFER_MODE: { struct scsipi_xfer_mode *xm = arg; xm->xm_mode = PERIPH_CAP_TQING; xm->xm_period = 0; xm->xm_offset = 0; scsipi_async_event(chan, ASYNC_EVENT_XFER_MODE, xm); return; } default: DPRINTF(("%s: unhandled %d\n", __func__, request)); return; } xs = arg; periph = xs->xs_periph; /* * This can happen when we run out of queue slots. */ vr = vioscsi_req_get(sc); if (vr == NULL) { xs->error = XS_BUSY; scsipi_done(xs); return; } req = &vr->vr_req; slot = vr - sc->sc_reqs; /* * "The only supported format for the LUN field is: first byte set to * 1, second byte set to target, third and fourth byte representing a * single level LUN structure, followed by four zero bytes." */ if (periph->periph_target >= 256 || periph->periph_lun >= 16384 || periph->periph_target < 0 || periph->periph_lun < 0) { goto stuffup; } req->lun[0] = 1; req->lun[1] = periph->periph_target; req->lun[2] = 0x40 | ((periph->periph_lun >> 8) & 0x3F); req->lun[3] = periph->periph_lun & 0xFF; memset(req->lun + 4, 0, 4); DPRINTF(("%s: command %p for %d:%d at slot %d\n", __func__, xs, periph->periph_target, periph->periph_lun, slot)); /* tag */ switch (XS_CTL_TAGTYPE(xs)) { case XS_CTL_HEAD_TAG: req->task_attr = VIRTIO_SCSI_S_HEAD; break; #if 0 /* XXX */ case XS_CTL_ACA_TAG: req->task_attr = VIRTIO_SCSI_S_ACA; break; #endif case XS_CTL_ORDERED_TAG: req->task_attr = VIRTIO_SCSI_S_ORDERED; break; case XS_CTL_SIMPLE_TAG: default: req->task_attr = VIRTIO_SCSI_S_SIMPLE; break; } req->id = virtio_rw64(vsc, slot); if ((size_t)xs->cmdlen > sizeof(req->cdb)) { DPRINTF(("%s: bad cmdlen %zu > %zu\n", __func__, (size_t)xs->cmdlen, sizeof(req->cdb))); goto stuffup; } memset(req->cdb, 0, sizeof(req->cdb)); memcpy(req->cdb, xs->cmd, xs->cmdlen); error = bus_dmamap_load(virtio_dmat(vsc), vr->vr_data, xs->data, xs->datalen, NULL, XS2DMA(xs)); if (error) { aprint_error_dev(sc->sc_dev, "%s: error %d loading DMA map\n", __func__, error); if (error == ENOMEM || error == EAGAIN) { /* * Map is allocated with ALLOCNOW, so this should * actually never ever happen. 
*/ xs->error = XS_RESOURCE_SHORTAGE; } else { stuffup: /* not a temporary condition */ xs->error = XS_DRIVER_STUFFUP; } virtio_enqueue_abort(vsc, vq, slot); scsipi_done(xs); return; } int nsegs = VIRTIO_SCSI_MIN_SEGMENTS; if ((xs->xs_control & (XS_CTL_DATA_IN|XS_CTL_DATA_OUT)) != 0) nsegs += vr->vr_data->dm_nsegs; error = virtio_enqueue_reserve(vsc, vq, slot, nsegs); if (error) { bus_dmamap_unload(virtio_dmat(vsc), vr->vr_data); /* slot already freed by virtio_enqueue_reserve() */ xs->error = XS_BUSY; scsipi_done(xs); return; } vr->vr_xs = xs; bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_PREWRITE); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_PREREAD); if ((xs->xs_control & (XS_CTL_DATA_IN|XS_CTL_DATA_OUT)) != 0) bus_dmamap_sync(virtio_dmat(vsc), vr->vr_data, 0, xs->datalen, XS2DMAPRE(xs)); virtio_enqueue_p(vsc, vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), 1); if (xs->xs_control & XS_CTL_DATA_OUT) virtio_enqueue(vsc, vq, slot, vr->vr_data, 1); virtio_enqueue_p(vsc, vq, slot, vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), 0); if (xs->xs_control & XS_CTL_DATA_IN) virtio_enqueue(vsc, vq, slot, vr->vr_data, 0); dopoll = (xs->xs_control & XS_CTL_POLL) != 0; virtio_enqueue_commit(vsc, vq, slot, 1); if (!dopoll) return; DPRINTF(("%s: polling...\n", __func__)); // XXX: do this better. int timeout = 1000; do { virtio_intrhand(vsc); if (vr->vr_xs != xs) break; delay(1000); } while (--timeout > 0); if (vr->vr_xs == xs) { // XXX: Abort! xs->error = XS_TIMEOUT; xs->resid = xs->datalen; DPRINTF(("%s: polling timeout\n", __func__)); scsipi_done(xs); } DPRINTF(("%s: command %p done (timeout=%d)\n", __func__, xs, timeout)); } static void vioscsi_req_done(struct vioscsi_softc *sc, struct virtio_softc *vsc, struct vioscsi_req *vr, struct virtqueue *vq, int slot) { struct scsipi_xfer *xs = vr->vr_xs; size_t sense_len; DPRINTF(("%s: enter\n", __func__)); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_req), sizeof(struct virtio_scsi_req_hdr), BUS_DMASYNC_POSTWRITE); bus_dmamap_sync(virtio_dmat(vsc), vr->vr_control, offsetof(struct vioscsi_req, vr_res), sizeof(struct virtio_scsi_res_hdr), BUS_DMASYNC_POSTREAD); if (xs->datalen) bus_dmamap_sync(virtio_dmat(vsc), vr->vr_data, 0, xs->datalen, XS2DMAPOST(xs)); xs->status = vr->vr_res.status; xs->resid = virtio_rw32(vsc, vr->vr_res.residual); switch (vr->vr_res.response) { case VIRTIO_SCSI_S_OK: sense_len = MIN(sizeof(xs->sense), virtio_rw32(vsc, vr->vr_res.sense_len)); memcpy(&xs->sense, vr->vr_res.sense, sense_len); xs->error = (sense_len == 0) ? 
XS_NOERROR : XS_SENSE; break; case VIRTIO_SCSI_S_BAD_TARGET: vioscsi_bad_target(xs); break; default: DPRINTF(("%s: stuffup: %d\n", __func__, vr->vr_res.response)); xs->error = XS_DRIVER_STUFFUP; xs->resid = xs->datalen; break; } DPRINTF(("%s: command %p done %d, %d, %d\n", __func__, xs, xs->error, xs->status, xs->resid)); bus_dmamap_unload(virtio_dmat(vsc), vr->vr_data); vr->vr_xs = NULL; virtio_dequeue_commit(vsc, vq, slot); mutex_exit(&sc->sc_mutex); scsipi_done(xs); mutex_enter(&sc->sc_mutex); } static void vioscsi_bad_target(struct scsipi_xfer *xs) { struct scsi_sense_data *sense = &xs->sense.scsi_sense; DPRINTF(("%s: bad target %d:%d\n", __func__, xs->xs_periph->periph_target, xs->xs_periph->periph_lun)); memset(sense, 0, sizeof(*sense)); sense->response_code = 0x70; sense->flags = SKEY_ILLEGAL_REQUEST; xs->error = XS_SENSE; xs->status = 0; xs->resid = 0; } static int vioscsi_vq_done(struct virtqueue *vq) { struct virtio_softc *vsc = vq->vq_owner; struct vioscsi_softc *sc = device_private(virtio_child(vsc)); int ret = 0; DPRINTF(("%s: enter %d\n", __func__, vq->vq_index)); mutex_enter(&sc->sc_mutex); for (;;) { int r, slot; r = virtio_dequeue(vsc, vq, &slot, NULL); if (r != 0) break; DPRINTF(("%s: slot=%d\n", __func__, slot)); vioscsi_req_done(sc, vsc, &sc->sc_reqs[slot], vq, slot); ret = 1; } mutex_exit(&sc->sc_mutex); DPRINTF(("%s: exit %d: %d\n", __func__, vq->vq_index, ret)); return ret; } static struct vioscsi_req * vioscsi_req_get(struct vioscsi_softc *sc) { struct virtio_softc *vsc = device_private(device_parent(sc->sc_dev)); struct virtqueue *vq = &sc->sc_vqs[VIOSCSI_VQ_REQUEST]; struct vioscsi_req *vr = NULL; int r, slot; mutex_enter(&sc->sc_mutex); if ((r = virtio_enqueue_prep(vsc, vq, &slot)) != 0) { DPRINTF(("%s: virtio_enqueue_get error %d\n", __func__, r)); goto out; } KASSERT(slot < sc->sc_nreqs); vr = &sc->sc_reqs[slot]; DPRINTF(("%s: %p, %d\n", __func__, vr, slot)); out: mutex_exit(&sc->sc_mutex); return vr; } static int vioscsi_alloc_reqs(struct vioscsi_softc *sc, struct virtio_softc *vsc, int qsize) { size_t allocsize; int r, rsegs, slot; void *vaddr; struct vioscsi_req *vr; allocsize = qsize * sizeof(struct vioscsi_req); r = bus_dmamem_alloc(virtio_dmat(vsc), allocsize, 0, 0, &sc->sc_reqs_segs[0], 1, &rsegs, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamem_alloc, size %zu, error %d\n", __func__, allocsize, r); return r; } r = bus_dmamem_map(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1, allocsize, &vaddr, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamem_map failed, error %d\n", __func__, r); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); return r; } memset(vaddr, 0, allocsize); sc->sc_reqs = vaddr; sc->sc_nreqs = qsize; /* Prepare maps for the requests */ for (slot=0; slot < qsize; slot++) { vr = &sc->sc_reqs[slot]; r = bus_dmamap_create(virtio_dmat(vsc), offsetof(struct vioscsi_req, vr_xs), 1, offsetof(struct vioscsi_req, vr_xs), 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_control); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamap_create ctrl failed, error %d\n", __func__, r); goto cleanup; } r = bus_dmamap_create(virtio_dmat(vsc), MAXPHYS, sc->sc_seg_max, MAXPHYS, 0, BUS_DMA_NOWAIT|BUS_DMA_ALLOCNOW, &vr->vr_data); if (r != 0) { aprint_error_dev(sc->sc_dev, "%s: bus_dmamap_create data failed, error %d\n", __func__, r); goto cleanup; } r = bus_dmamap_load(virtio_dmat(vsc), vr->vr_control, vr, offsetof(struct vioscsi_req, vr_xs), NULL, BUS_DMA_NOWAIT); if (r != 0) { aprint_error_dev(sc->sc_dev, 
"%s: bus_dmamap_load ctrl error %d\n", __func__, r); goto cleanup; } } return 0; cleanup: for (; slot > 0; slot--) { vr = &sc->sc_reqs[slot]; if (vr->vr_control) { /* this will also unload the mapping if loaded */ bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_control); vr->vr_control = NULL; } if (vr->vr_data) { bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_data); vr->vr_data = NULL; } } bus_dmamem_unmap(virtio_dmat(vsc), vaddr, allocsize); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); return r; } static void vioscsi_free_reqs(struct vioscsi_softc *sc, struct virtio_softc *vsc) { int slot; struct vioscsi_req *vr; if (sc->sc_nreqs == 0) { /* Not allocated */ return; } /* Free request maps */ for (slot=0; slot < sc->sc_nreqs; slot++) { vr = &sc->sc_reqs[slot]; bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_control); bus_dmamap_destroy(virtio_dmat(vsc), vr->vr_data); } bus_dmamem_unmap(virtio_dmat(vsc), sc->sc_reqs, sc->sc_nreqs * sizeof(struct vioscsi_req)); bus_dmamem_free(virtio_dmat(vsc), &sc->sc_reqs_segs[0], 1); } MODULE(MODULE_CLASS_DRIVER, vioscsi, "virtio"); #ifdef _MODULE #include "ioconf.c" #endif static int vioscsi_modcmd(modcmd_t cmd, void *opaque) { int error = 0; #ifdef _MODULE switch (cmd) { case MODULE_CMD_INIT: error = config_init_component(cfdriver_ioconf_vioscsi, cfattach_ioconf_vioscsi, cfdata_ioconf_vioscsi); break; case MODULE_CMD_FINI: error = config_fini_component(cfdriver_ioconf_vioscsi, cfattach_ioconf_vioscsi, cfdata_ioconf_vioscsi); break; default: error = ENOTTY; break; } #endif return error; }
2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 /* $NetBSD: ufs_vnops.c,v 1.262 2022/03/27 16:24:59 christos Exp $ */ /*- * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.262 2022/03/27 16:24:59 christos Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_uvmhist.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/namei.h> #include <sys/resourcevar.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> #include <sys/fstrans.h> #include <sys/kmem.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/lockf.h> #include <sys/kauth.h> #include <sys/wapbl.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> #include <miscfs/genfs/genfs.h> #include <ufs/ufs/acl.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_bswap.h> #include <ufs/ufs/ufs_extern.h> #include <ufs/ufs/ufs_wapbl.h> #ifdef UFS_DIRHASH #include <ufs/ufs/dirhash.h> #endif #include <ufs/ext2fs/ext2fs_extern.h> #include <ufs/ext2fs/ext2fs_dir.h> #include <ufs/ffs/ffs_extern.h> #include <ufs/lfs/lfs_extern.h> #include <ufs/lfs/lfs.h> #ifdef UVMHIST #include <uvm/uvm.h> #endif #include <uvm/uvm_extern.h> #include <uvm/uvm_stat.h> __CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN); __CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN); static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *); static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t, struct lwp *); static int ufs_makeinode(struct vattr *, struct vnode *, const struct ufs_lookup_results *, struct vnode **, struct componentname *); /* * A virgin directory (no blushing please). */ static const struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".." 
}; /* * Create a regular file */ int ufs_create(void *v) { struct vop_create_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; int error; struct vnode *dvp = ap->a_dvp; struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ error = ufs_makeinode(ap->a_vap, dvp, ulr, ap->a_vpp, ap->a_cnp); if (error) { return (error); } UFS_WAPBL_END(dvp->v_mount); VOP_UNLOCK(*ap->a_vpp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(void *v) { struct vop_mknod_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct vattr *vap; struct vnode **vpp; struct inode *ip; int error; struct ufs_lookup_results *ulr; vap = ap->a_vap; vpp = ap->a_vpp; /* XXX should handle this material another way */ ulr = &VTOI(ap->a_dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ if ((error = ufs_makeinode(vap, ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0) goto out; ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0); UFS_WAPBL_END(ap->a_dvp->v_mount); VOP_UNLOCK(*vpp); out: if (error != 0) { *vpp = NULL; return (error); } return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (0); } static int ufs_check_possible(struct vnode *vp, struct inode *ip, accmode_t accmode, kauth_cred_t cred) { #if defined(QUOTA) || defined(QUOTA2) int error; #endif /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; #if defined(QUOTA) || defined(QUOTA2) error = chkdq(ip, 0, cred, 0); if (error != 0) return error; #endif break; case VBAD: case VBLK: case VCHR: case VSOCK: case VFIFO: case VNON: default: break; } } /* If it is a snapshot, nobody gets access to it. */ if ((ip->i_flags & SF_SNAPSHOT)) return EPERM; /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. 
*/ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & IMMUTABLE)) return EPERM; return 0; } static int ufs_check_permitted(struct vnode *vp, struct inode *ip, struct acl *acl, accmode_t accmode, kauth_cred_t cred, int (*func)(struct vnode *, kauth_cred_t, uid_t, gid_t, mode_t, struct acl *, accmode_t)) { return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(accmode, vp->v_type, ip->i_mode & ALLPERMS), vp, NULL, (*func)(vp, cred, ip->i_uid, ip->i_gid, ip->i_mode & ALLPERMS, acl, accmode)); } int ufs_accessx(void *v) { struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif error = ufs_check_possible(vp, ip, accmode, ap->a_cred); if (error) return error; #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_POSIX1EACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(KM_SLEEP); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, curlwp); else error = VOP_GETACL(vp, type, acl, ap->a_cred); if (!error) { if (type == ACL_TYPE_NFS4) { error = ufs_check_permitted(vp, ip, acl, accmode, ap->a_cred, genfs_can_access_acl_nfs4); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = ufs_check_permitted(vp, ip, acl, accmode, ap->a_cred, genfs_can_access_acl_posix1e); } acl_free(acl); return error; } if (error != EOPNOTSUPP) printf("%s: Error retrieving ACL: %d\n", __func__, error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ acl_free(acl); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error) return error; return ufs_check_permitted(vp, ip, NULL, accmode, ap->a_cred, genfs_can_access); } /* ARGSUSED */ int ufs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; struct inode *ip; struct vattr *vap; vp = ap->a_vp; ip = VTOI(vp); vap = ap->a_vap; UFS_ITIMES(vp, NULL, NULL, NULL); /* * Copy from inode table */ vap->va_fsid = ip->i_dev; vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ALLPERMS; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_size = vp->v_size; if (ip->i_ump->um_fstype == UFS1) { switch (vp->v_type) { case VBLK: case VCHR: vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, UFS_MPNEEDSWAP(ip->i_ump)); break; default: vap->va_rdev = NODEV; break; } vap->va_atime.tv_sec = ip->i_ffs1_atime; vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; vap->va_mtime.tv_sec = ip->i_ffs1_mtime; vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; vap->va_ctime.tv_sec = ip->i_ffs1_ctime; vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; vap->va_birthtime.tv_sec = 0; vap->va_birthtime.tv_nsec = 0; vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks); } else { switch (vp->v_type) { case VBLK: case VCHR: vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, UFS_MPNEEDSWAP(ip->i_ump)); break; default: vap->va_rdev = NODEV; break; } vap->va_atime.tv_sec = ip->i_ffs2_atime; vap->va_atime.tv_nsec = ip->i_ffs2_atimensec; vap->va_mtime.tv_sec = ip->i_ffs2_mtime; vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec; vap->va_ctime.tv_sec = ip->i_ffs2_ctime; vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec; vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime; vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec; 
vap->va_bytes = dbtob(ip->i_ffs2_blocks); } vap->va_gen = ip->i_gen; vap->va_flags = ip->i_flags; /* this doesn't belong here */ if (vp->v_type == VBLK) vap->va_blocksize = BLKDEV_IOSIZE; else if (vp->v_type == VCHR) vap->va_blocksize = MAXBSIZE; else vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = vp->v_type; vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(void *v) { struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vattr *vap; struct vnode *vp; struct inode *ip; kauth_cred_t cred; struct lwp *l; int error; kauth_action_t action; bool changing_sysflags; vap = ap->a_vap; vp = ap->a_vp; ip = VTOI(vp); cred = ap->a_cred; l = curlwp; action = KAUTH_VNODE_WRITE_FLAGS; changing_sysflags = false; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } /* Snapshot flag cannot be set or cleared */ if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) { error = EPERM; goto out; } if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { action |= KAUTH_VNODE_HAS_SYSFLAGS; } if ((vap->va_flags & SF_SETTABLE) != (ip->i_flags & SF_SETTABLE)) { action |= KAUTH_VNODE_WRITE_SYSFLAGS; changing_sysflags = true; } error = kauth_authorize_vnode(cred, action, vp, NULL, genfs_can_chflags(vp, cred, ip->i_uid, changing_sysflags)); if (error) goto out; if (changing_sysflags) { error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; ip->i_flags = vap->va_flags; DIP_ASSIGN(ip, flags, ip->i_flags); } else { error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); DIP_ASSIGN(ip, flags, ip->i_flags); } ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); UFS_WAPBL_END(vp->v_mount); if (vap->va_flags & (IMMUTABLE | APPEND)) { error = 0; goto out; } } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l); UFS_WAPBL_END(vp->v_mount); if (error) goto out; } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. 
*/ switch (vp->v_type) { case VDIR: error = EISDIR; goto out; case VCHR: case VBLK: case VFIFO: break; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0) { error = EPERM; goto out; } error = ufs_truncate_retry(vp, 0, vap->va_size, cred); if (error) goto out; break; default: error = EOPNOTSUPP; goto out; } } ip = VTOI(vp); if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0) { error = EPERM; goto out; } error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL, genfs_can_chtimes(vp, cred, ip->i_uid, vap->va_vaflags)); if (error) goto out; error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; if (vap->va_atime.tv_sec != VNOVAL) if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vp->v_mount->mnt_flag & MNT_RELATIME) ip->i_flag |= IN_ACCESS; } if (vap->va_birthtime.tv_sec != VNOVAL && ip->i_ump->um_fstype == UFS2) { ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec; ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0); UFS_WAPBL_END(vp->v_mount); if (error) goto out; } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) { error = EROFS; goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) { error = EPERM; goto out; } error = UFS_WAPBL_BEGIN(vp->v_mount); if (error) goto out; error = ufs_chmod(vp, (int)vap->va_mode, cred, l); UFS_WAPBL_END(vp->v_mount); } out: cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, kauth_cred_t cred, struct lwp *l) { int error; struct acl *aclp; aclp = acl_alloc(KM_SLEEP); error = ufs_getacl_nfs4_internal(vp, aclp, l); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, l, false); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) { struct inode *ip; int error; UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); ip = VTOI(vp); #ifdef UFS_ACL /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0) return error; #endif error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp, NULL, genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode)); if (error) return (error); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) { error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, l); if (error) return error; } #endif ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; DIP_ASSIGN(ip, mode, ip->i_mode); UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. 
*/ static int ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct lwp *l) { struct inode *ip; int error = 0; #if defined(QUOTA) || defined(QUOTA2) uid_t ouid; gid_t ogid; int64_t change; #endif ip = VTOI(vp); error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; #ifdef UFS_ACL /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0) return error; #endif error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp, NULL, genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid)); if (error) return (error); #if defined(QUOTA) || defined(QUOTA2) ogid = ip->i_gid; ouid = ip->i_uid; change = DIP(ip, blocks); (void) chkdq(ip, -change, cred, 0); (void) chkiq(ip, -1, cred, 0); #endif ip->i_gid = gid; DIP_ASSIGN(ip, gid, gid); ip->i_uid = uid; DIP_ASSIGN(ip, uid, uid); #if defined(QUOTA) || defined(QUOTA2) if ((error = chkdq(ip, change, cred, 0)) == 0) { if ((error = chkiq(ip, 1, cred, 0)) == 0) goto good; else (void) chkdq(ip, -change, cred, FORCE); } ip->i_gid = ogid; DIP_ASSIGN(ip, gid, ogid); ip->i_uid = ouid; DIP_ASSIGN(ip, uid, ouid); (void) chkdq(ip, change, cred, FORCE); (void) chkiq(ip, 1, cred, FORCE); return (error); good: #endif /* QUOTA || QUOTA2 */ ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); cache_enter_id(vp, ip->i_mode, ip->i_uid, ip->i_gid, !HAS_ACLS(ip)); return (0); } int ufs_remove(void *v) { struct vop_remove_v3_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; nlink_t ctx_vp_new_nlink; } */ *ap = v; struct vnode *vp, *dvp; struct inode *ip; struct mount *mp; int error; struct ufs_lookup_results *ulr; vp = ap->a_vp; dvp = ap->a_dvp; ip = VTOI(vp); mp = dvp->v_mount; KASSERT(mp == vp->v_mount); /* XXX Not stable without lock. */ #ifdef UFS_ACL #ifdef notyet /* We don't do this because if the filesystem is mounted without ACLs * this goes through vfs_unixify_accmode() and we get EPERM. */ error = VOP_ACCESSX(vp, VDELETE, ap->a_cnp->cn_cred); if (error) goto err; #endif #endif /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) error = EPERM; else { error = UFS_WAPBL_BEGIN(mp); if (error == 0) { error = ufs_dirremove(dvp, ulr, ip, ap->a_cnp->cn_flags, 0); UFS_WAPBL_END(mp); if (error == 0) { ap->ctx_vp_new_nlink = ip->i_nlink; } } } #ifdef notyet err: #endif if (dvp == vp) vrele(vp); else vput(vp); return (error); } /* * ufs_link: create hard link. */ int ufs_link(void *v) { struct vop_link_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; struct mount *mp = dvp->v_mount; struct inode *ip; struct direct *newdir; int error, abrt = 1; struct ufs_lookup_results *ulr; KASSERT(dvp != vp); KASSERT(vp->v_type != VDIR); KASSERT(mp == vp->v_mount); /* XXX Not stable without lock. 
*/ /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); error = vn_lock(vp, LK_EXCLUSIVE); if (error) goto out2; ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out1; } error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp, dvp, 0); if (error) goto out1; error = UFS_WAPBL_BEGIN(mp); if (error) goto out1; ip->i_nlink++; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; abrt = 0; error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP); if (!error) { newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); } if (error) { ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP); } UFS_WAPBL_END(mp); out1: VOP_UNLOCK(vp); out2: if (abrt) VOP_ABORTOP(dvp, cnp); return (error); } /* * whiteout vnode call */ int ufs_whiteout(void *v) { struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap = v; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct *newdir; int error; struct ufsmount *ump = VFSTOUFS(dvp->v_mount); struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &VTOI(dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(dvp)); error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (ump->um_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) break; KASSERTMSG((ump->um_maxsymlinklen > 0), "ufs_whiteout: old format filesystem"); newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); newdir->d_ino = UFS_WINO; newdir->d_namlen = cnp->cn_namelen; memcpy(newdir->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen); /* NUL terminate and zero out padding */ memset(&newdir->d_name[cnp->cn_namelen], 0, UFS_NAMEPAD(cnp->cn_namelen)); newdir->d_type = DT_WHT; error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); break; case DELETE: /* remove an existing directory whiteout */ error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) break; KASSERTMSG((ump->um_maxsymlinklen > 0), "ufs_whiteout: old format filesystem"); cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); /* NOTREACHED */ } UFS_WAPBL_END(dvp->v_mount); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, kauth_cred_t cred, struct lwp *l) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(KM_SLEEP); dacl = acl_alloc(KM_SLEEP); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; DIP_ASSIGN(ip, mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. 
*/ ip->i_mode = dmode; DIP_ASSIGN(ip, mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ UFS_WAPBL_END(tvp->v_mount); error = ufs_setacl_posix1e(tvp, ACL_TYPE_ACCESS, acl, cred, l); if (error == 0) error = ufs_setacl_posix1e(tvp, ACL_TYPE_DEFAULT, dacl, cred, l); UFS_WAPBL_BEGIN(tvp->v_mount); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, kauth_cred_t cred, struct lwp *l) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(KM_SLEEP); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; DIP_ASSIGN(ip, mode, mode); error = 0; goto out; default: goto out; } UFS_WAPBL_END(tvp->v_mount); /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred); UFS_WAPBL_BEGIN(tvp->v_mount); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. 
*/ printf("%s: VOP_GETACL() but no VOP_SETACL()\n", __func__); /* panic("%s: VOP_GETACL() but no VOP_SETACL()", __func__); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, kauth_cred_t cred, struct lwp *l) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(KM_SLEEP); child_aclp = acl_alloc(KM_SLEEP); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, l); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, l, false); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif int ufs_mkdir(void *v) { struct vop_mkdir_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap = v; struct vnode *dvp = ap->a_dvp, *tvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp = VTOI(dvp); struct buf *bp; struct dirtemplate dirtemplate; struct direct *newdir; int error; struct ufsmount *ump = dp->i_ump; int dirblksiz = ump->um_dirblksiz; struct ufs_lookup_results *ulr; /* XXX should handle this material another way */ ulr = &dp->i_crap; UFS_CHECK_CRAPCOUNTER(dp); KASSERT(vap->va_type == VDIR); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, ap->a_vpp); if (error) goto out; error = vn_lock(*ap->a_vpp, LK_EXCLUSIVE); if (error) { vrele(*ap->a_vpp); *ap->a_vpp = NULL; goto out; } error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount); if (error) { vput(*ap->a_vpp); goto out; } tvp = *ap->a_vpp; ip = VTOI(tvp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_nlink = 2; DIP_ASSIGN(ip, nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_ASSIGN(ip, flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_nlink++; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0) goto bad; #ifdef UFS_ACL mode_t dmode = (vap->va_mode & 0777) | IFDIR; struct lwp *l = curlwp; if (dvp->v_mount->mnt_flag & MNT_POSIX1EACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, l); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, l); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. 
*/ dirtemplate = mastertemplate; dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen; dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump)); dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump)); dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen, UFS_MPNEEDSWAP(ump)); dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen, UFS_MPNEEDSWAP(ump)); if (ump->um_maxsymlinklen <= 0) { #if BYTE_ORDER == LITTLE_ENDIAN if (UFS_MPNEEDSWAP(ump) == 0) #else if (UFS_MPNEEDSWAP(ump) != 0) #endif { dirtemplate.dot_type = dirtemplate.dot_namlen; dirtemplate.dotdot_type = dirtemplate.dotdot_namlen; dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0; } else dirtemplate.dot_type = dirtemplate.dotdot_type = 0; } if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred, B_CLRBUF, &bp)) != 0) goto bad; ip->i_size = dirblksiz; DIP_ASSIGN(ip, size, dirblksiz); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; uvm_vnp_setsize(tvp, ip->i_size); memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate); /* * Directory set up, now install its entry in the parent directory. * We must write out the buffer containing the new directory body * before entering the new name in the parent. */ if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0) goto bad; if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) { goto bad; } newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp); pool_cache_put(ufs_direct_cache, newdir); bad: if (error == 0) { VOP_UNLOCK(tvp); UFS_WAPBL_END(dvp->v_mount); } else { dp->i_nlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); /* * No need to do an explicit UFS_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_nlink = 0; DIP_ASSIGN(ip, nlink, 0); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP); UFS_WAPBL_END(dvp->v_mount); vput(tvp); } out: return (error); } int ufs_rmdir(void *v) { struct vop_rmdir_v2_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; struct vnode *vp, *dvp; struct componentname *cnp; struct inode *ip, *dp; int error; struct ufs_lookup_results *ulr; vp = ap->a_vp; dvp = ap->a_dvp; cnp = ap->a_cnp; ip = VTOI(vp); dp = VTOI(dvp); #ifdef UFS_ACL #ifdef notyet /* We don't do this because if the filesystem is mounted without ACLs * this goes through vfs_unixify_accmode() and we get EPERM. */ error = VOP_ACCESSX(vp, VDELETE, cnp->cn_cred); if (error) goto err; #endif #endif /* XXX should handle this material another way */ ulr = &dp->i_crap; UFS_CHECK_CRAPCOUNTER(dp); /* * No rmdir "." or of mounted directories please. */ if (dp == ip || vp->v_mountedhere != NULL) { error = EINVAL; goto err; } /* * Do not remove a directory that is in the process of being renamed. * Verify that the directory is empty (and valid). (Rmdir ".." won't * be valid since ".." will contain a reference to the current * directory and thus be non-empty.) */ error = 0; if (ip->i_nlink != 2 || !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { error = EPERM; goto out; } error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) goto out; /* * Delete reference to directory before purging * inode. 
If we crash in between, the directory * will be reattached to lost+found, */ error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1); if (error) { UFS_WAPBL_END(dvp->v_mount); goto out; } cache_purge(dvp); /* * Truncate inode. The only stuff left in the directory is "." and * "..". The "." reference is inconsequential since we're quashing * it. */ dp->i_nlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; (void) UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred); cache_purge(vp); /* * Unlock the log while we still have reference to unlinked * directory vp so that it will not get locked for recycling */ UFS_WAPBL_END(dvp->v_mount); #ifdef UFS_DIRHASH if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: vput(vp); return error; err: if (dp == ip) vrele(vp); else vput(vp); return error; } /* * symlink -- make a symbolic link */ int ufs_symlink(void *v) { struct vop_symlink_v3_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap = v; struct vnode *vp, **vpp; struct inode *ip; int len, error; struct ufs_lookup_results *ulr; vpp = ap->a_vpp; /* XXX should handle this material another way */ ulr = &VTOI(ap->a_dvp)->i_crap; UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); /* * UFS_WAPBL_BEGIN(dvp->v_mount) performed by successful * ufs_makeinode */ KASSERT(ap->a_vap->va_type == VLNK); error = ufs_makeinode(ap->a_vap, ap->a_dvp, ulr, vpp, ap->a_cnp); if (error) goto out; vp = *vpp; len = strlen(ap->a_target); ip = VTOI(vp); /* * This test is off by one. um_maxsymlinklen contains the * number of bytes available, and we aren't storing a \0, so * the test should properly be <=. However, it cannot be * changed as this would break compatibility with existing fs * images -- see the way ufs_readlink() works. */ if (len < ip->i_ump->um_maxsymlinklen) { memcpy((char *)SHORTLINK(ip), ap->a_target, len); ip->i_size = len; DIP_ASSIGN(ip, size, len); uvm_vnp_setsize(vp, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (vp->v_mount->mnt_flag & MNT_RELATIME) ip->i_flag |= IN_ACCESS; UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); } else error = ufs_bufio(UIO_WRITE, vp, ap->a_target, len, (off_t)0, IO_NODELOCKED | IO_JOURNALLOCKED, ap->a_cnp->cn_cred, NULL, NULL); UFS_WAPBL_END(ap->a_dvp->v_mount); VOP_UNLOCK(vp); if (error) vrele(vp); out: return (error); } /* * Vnode op for reading directories. * * This routine handles converting from the on-disk directory format * "struct direct" to the in-memory format "struct dirent" as well as * byte swapping the entries if necessary. 
*/ int ufs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; /* vnode and fs */ struct vnode *vp = ap->a_vp; struct ufsmount *ump = VFSTOUFS(vp->v_mount); int nswap = UFS_MPNEEDSWAP(ump); #if BYTE_ORDER == LITTLE_ENDIAN int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0; #else int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0; #endif /* caller's buffer */ struct uio *calleruio = ap->a_uio; off_t startoffset, endoffset; size_t callerbytes; off_t curoffset; /* dirent production buffer */ char *direntbuf; size_t direntbufmax; struct dirent *dirent, *stopdirent; /* output cookies array */ off_t *cookies; size_t numcookies, maxcookies; /* disk buffer */ off_t physstart, physend; size_t skipstart, dropend; char *rawbuf; size_t rawbufmax, rawbytes; struct uio rawuio; struct iovec rawiov; struct direct *rawdp, *stoprawdp; /* general */ int error; KASSERT(VOP_ISLOCKED(vp)); /* * Figure out where the user wants us to read and how much. * * XXX: there should probably be an upper bound on callerbytes * to avoid silliness trying to do large kernel allocations. */ callerbytes = calleruio->uio_resid; startoffset = calleruio->uio_offset; endoffset = startoffset + callerbytes; if (callerbytes < _DIRENT_MINSIZE(dirent)) { /* no room for even one struct dirent */ return EINVAL; } /* * Now figure out where to actually start reading. Round the * start down to a block boundary: we need to start at the * beginning of a block in order to read the directory * correctly. * * We also want to always read a whole number of blocks so * that the copying code below doesn't have to worry about * partial entries. (It used to try at one point, and was a * horrible mess.) * * Furthermore, since blocks have to be scanned from the * beginning, if we go partially into another block now we'll * just have to rescan it on the next readdir call, which * doesn't really serve any useful purpose. * * So, round down the end as well. It's ok to underpopulate * the transfer buffer, as long as we send back at least one * dirent so as to avoid giving a bogus EOF indication. * * Note that because dirents are larger than ffs struct * directs, despite the rounding down we may not be able to * send all the entries in the blocks we read and may have to * rescan some of them on the next call anyway. Alternatively * if there's empty space on disk we might have actually been * able to fit the next block in, and so forth. None of this * actually matters that much in practice. * * XXX: what does ffs do if a directory block becomes * completely empty, and what happens if all the blocks we * read are completely empty even though we aren't at EOF? As * of this writing I (dholland) can't remember the details. */ physstart = rounddown2(startoffset, ump->um_dirblksiz); physend = rounddown2(endoffset, ump->um_dirblksiz); if (physstart >= physend) { /* Need at least one block */ return EINVAL; } /* * skipstart is the number of bytes we need to read in * (because we need to start at the beginning of a block) but * not transfer to the user. * * dropend is the number of bytes to ignore at the end of the * user's buffer. */ skipstart = startoffset - physstart; dropend = endoffset - physend; /* * Make a transfer buffer. * * Note: rawbufmax = physend - physstart. 
Proof: * * physend - physstart = physend - physstart * = physend - physstart + startoffset - startoffset * = physend + (startoffset - physstart) - startoffset * = physend + skipstart - startoffset * = physend + skipstart - startoffset + endoffset - endoffset * = skipstart - startoffset + endoffset - (endoffset - physend) * = skipstart - startoffset + endoffset - dropend * = skipstart - startoffset + (startoffset + callerbytes) - dropend * = skipstart + callerbytes - dropend * = rawbufmax * Qed. * * XXX: this should just use physend - physstart. * * XXX: this should be rewritten to read the directs straight * out of bufferio buffers instead of copying twice. This would * also let us adapt better to the user's buffer size. */ /* Base buffer space for CALLERBYTES of new data */ rawbufmax = callerbytes + skipstart; if (rawbufmax < callerbytes) return EINVAL; rawbufmax -= dropend; if (rawbufmax < _DIRENT_MINSIZE(rawdp)) { /* no room for even one struct direct */ return EINVAL; } /* read it */ rawbuf = kmem_alloc(rawbufmax, KM_SLEEP); rawiov.iov_base = rawbuf; rawiov.iov_len = rawbufmax; rawuio.uio_iov = &rawiov; rawuio.uio_iovcnt = 1; rawuio.uio_offset = physstart; rawuio.uio_resid = rawbufmax; UIO_SETUP_SYSSPACE(&rawuio); rawuio.uio_rw = UIO_READ; error = UFS_BUFRD(vp, &rawuio, 0, ap->a_cred); if (error != 0) { kmem_free(rawbuf, rawbufmax); return error; } rawbytes = rawbufmax - rawuio.uio_resid; /* the raw entries to iterate over */ rawdp = (struct direct *)(void *)rawbuf; stoprawdp = (struct direct *)(void *)&rawbuf[rawbytes]; /* allocate space to produce dirents into */ direntbufmax = callerbytes; direntbuf = kmem_alloc(direntbufmax, KM_SLEEP); /* the dirents to iterate over */ dirent = (struct dirent *)(void *)direntbuf; stopdirent = (struct dirent *)(void *)&direntbuf[direntbufmax]; /* the output "cookies" (seek positions of directory entries) */ if (ap->a_cookies) { numcookies = 0; maxcookies = rawbytes / _DIRENT_RECLEN(rawdp, 1); cookies = malloc(maxcookies * sizeof(*cookies), M_TEMP, M_WAITOK); } else { /* XXX: GCC */ maxcookies = 0; cookies = NULL; } /* now produce the dirents */ curoffset = calleruio->uio_offset; while (rawdp < stoprawdp) { rawdp->d_reclen = ufs_rw16(rawdp->d_reclen, nswap); if (skipstart > 0) { /* drain skipstart */ if (rawdp->d_reclen <= skipstart) { skipstart -= rawdp->d_reclen; rawdp = _DIRENT_NEXT(rawdp); continue; } /* caller's start position wasn't on an entry */ error = EINVAL; goto out; } if (rawdp->d_reclen == 0) { struct dirent *save = dirent; dirent->d_reclen = _DIRENT_MINSIZE(dirent); dirent = _DIRENT_NEXT(dirent); save->d_reclen = 0; rawdp = stoprawdp; break; } /* copy the header */ if (needswap) { dirent->d_type = rawdp->d_namlen; dirent->d_namlen = rawdp->d_type; } else { dirent->d_type = rawdp->d_type; dirent->d_namlen = rawdp->d_namlen; } dirent->d_reclen = _DIRENT_RECLEN(dirent, dirent->d_namlen); /* stop if there isn't room for the name AND another header */ if ((char *)(void *)dirent + dirent->d_reclen + _DIRENT_MINSIZE(dirent) > (char *)(void *)stopdirent) break; /* copy the name (and inode (XXX: why after the test?)) */ dirent->d_fileno = ufs_rw32(rawdp->d_ino, nswap); (void)memcpy(dirent->d_name, rawdp->d_name, dirent->d_namlen); memset(&dirent->d_name[dirent->d_namlen], 0, dirent->d_reclen - _DIRENT_NAMEOFF(dirent) - dirent->d_namlen); /* onward */ curoffset += rawdp->d_reclen; if (ap->a_cookies) { KASSERT(numcookies < maxcookies); cookies[numcookies++] = curoffset; } dirent = _DIRENT_NEXT(dirent); rawdp = _DIRENT_NEXT(rawdp); } /* transfer 
the dirents to the caller's buffer */ callerbytes = ((char *)(void *)dirent - direntbuf); error = uiomove(direntbuf, callerbytes, calleruio); out: calleruio->uio_offset = curoffset; if (ap->a_cookies) { if (error) { free(cookies, M_TEMP); *ap->a_cookies = NULL; *ap->a_ncookies = 0; } else { *ap->a_cookies = cookies; *ap->a_ncookies = numcookies; } } kmem_free(direntbuf, direntbufmax); kmem_free(rawbuf, rawbufmax); *ap->a_eofflag = VTOI(vp)->i_size <= calleruio->uio_offset; return error; } /* * Return target name of a symbolic link */ int ufs_readlink(void *v) { struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ufsmount *ump = VFSTOUFS(vp->v_mount); int isize; /* * The test against um_maxsymlinklen is off by one; it should * theoretically be <=, not <. However, it cannot be changed * as that would break compatibility with existing fs images. */ isize = ip->i_size; if (isize < ump->um_maxsymlinklen || (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) { uiomove((char *)SHORTLINK(ip), isize, ap->a_uio); return (0); } return (UFS_BUFRD(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ufs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp; struct vnode *vp; struct inode *ip; struct mount *mp; int error; bp = ap->a_bp; vp = ap->a_vp; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); KASSERT(fstrans_held(vp->v_mount)); KASSERT(bp->b_bcount != 0); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); return (error); } if (bp->b_blkno == -1) /* no valid data */ clrbuf(bp); } if (bp->b_blkno < 0) { /* block is not on disk */ biodone(bp); return (0); } vp = ip->i_devvp; error = VOP_STRATEGY(vp, bp); if (error) return error; if (!BUF_ISREAD(bp)) return 0; mp = wapbl_vptomp(vp); if (mp == NULL || mp->mnt_wapbl_replay == NULL || !WAPBL_REPLAY_ISOPEN(mp) || !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount)) return 0; error = biowait(bp); if (error) return error; error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount); if (error) { mutex_enter(&bufcache_lock); SET(bp->b_cflags, BC_INVAL); mutex_exit(&bufcache_lock); } return error; } /* * Print out the contents of an inode. */ int ufs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp; struct inode *ip; vp = ap->a_vp; ip = VTOI(vp); printf("tag VT_UFS, ino %llu, on dev %llu, %llu", (unsigned long long)ip->i_number, (unsigned long long)major(ip->i_dev), (unsigned long long)minor(ip->i_dev)); printf(" flags 0x%x, nlink %d\n", ip->i_flag, ip->i_nlink); printf("\tmode 0%o, owner %d, group %d, size %qd", ip->i_mode, ip->i_uid, ip->i_gid, (long long)ip->i_size); if (vp->v_type == VFIFO) VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set access flag. */ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. 
*/ int ufsspec_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set update and change flags. */ if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0) VTOI(ap->a_vp)->i_flag |= IN_MODIFY; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifo's */ int ufsfifo_read(void *v) { struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifo's. */ int ufsfifo_write(void *v) { struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; kauth_cred_t a_cred; } */ *ap = v; /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_MODIFY; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifo's. * * Update the times on the inode then do device close. */ int ufsfifo_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp; vp = ap->a_vp; if (vrefcnt(ap->a_vp) > 1) UFS_ITIMES(vp, NULL, NULL, NULL); return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = FFS_MAXNAMLEN; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_POSIX1EACLS) *ap->a_retval = 1; else *ap->a_retval = 0; return 0; case _PC_ACL_NFS4: if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; return 0; #endif case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_POSIX1EACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif return 0; case _PC_SYNC_IO: *ap->a_retval = 1; return (0); case _PC_FILESIZEBITS: *ap->a_retval = 42; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_2_SYMLINKS: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(void *v) { struct vop_advlock_args /* { struct vnode *a_vp; void * a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap = v; struct inode *ip; ip = VTOI(ap->a_vp); return lf_advlock(ap, &ip->i_lockf, ip->i_size); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. 
*/ void ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *), struct vnode **vpp) { struct timeval tv; struct inode *ip; struct vnode *vp; dev_t rdev; struct ufsmount *ump; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; ump = ip->i_ump; if (ump->um_fstype == UFS1) rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev, UFS_MPNEEDSWAP(ump)); else rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev, UFS_MPNEEDSWAP(ump)); spec_node_init(vp, rdev); break; case VFIFO: vp->v_op = fifoops; break; case VNON: case VBAD: case VSOCK: case VLNK: case VDIR: case VREG: break; } if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; /* * Initialize modrev times */ getmicrouptime(&tv); ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32 | tv.tv_usec * 4294u; *vpp = vp; } /* * Allocate a new inode. */ static int ufs_makeinode(struct vattr *vap, struct vnode *dvp, const struct ufs_lookup_results *ulr, struct vnode **vpp, struct componentname *cnp) { struct inode *ip; struct direct *newdir; struct vnode *tvp; int error; UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount); error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, &tvp); if (error) return error; error = vn_lock(tvp, LK_EXCLUSIVE); if (error) { vrele(tvp); return error; } *vpp = tvp; ip = VTOI(tvp); error = UFS_WAPBL_BEGIN(dvp->v_mount); if (error) { vput(tvp); return (error); } ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_nlink = 1; DIP_ASSIGN(ip, nlink, 1); /* Authorize setting SGID if needed. */ if (ip->i_mode & ISGID) { error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_WRITE_SECURITY, tvp, NULL, genfs_can_chmod(tvp, cnp->cn_cred, ip->i_uid, ip->i_gid, MAKEIMODE(vap->va_type, vap->va_mode))); if (error) { ip->i_mode &= ~ISGID; DIP_ASSIGN(ip, mode, ip->i_mode); } } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_ASSIGN(ip, flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) goto bad; #ifdef UFS_ACL struct lwp *l = curlwp; if (dvp->v_mount->mnt_flag & MNT_POSIX1EACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, ip->i_mode, cnp->cn_cred, l); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, ip->i_mode, cnp->cn_cred, l); if (error) goto bad; } #endif /* !UFS_ACL */ newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); ufs_makedirentry(ip, cnp, newdir); error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL); pool_cache_put(ufs_direct_cache, newdir); if (error) goto bad; *vpp = tvp; cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags); return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; DIP_ASSIGN(ip, nlink, 0); ip->i_flag |= IN_CHANGE; UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0); UFS_WAPBL_END(dvp->v_mount); vput(tvp); return (error); } /* * Allocate len bytes at offset off. 
*/ int ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, kauth_cred_t cred) { struct inode *ip = VTOI(vp); int error, delta, bshift, bsize; UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist); error = 0; bshift = vp->v_mount->mnt_fs_bshift; bsize = 1 << bshift; delta = off & (bsize - 1); off -= delta; len += delta; while (len > 0) { bsize = MIN(bsize, len); error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL); if (error) { goto out; } /* * increase file size now, UFS_BALLOC() requires that * EOF be up-to-date before each call. */ if (ip->i_size < off + bsize) { UVMHIST_LOG(ubchist, "vp %#jx old 0x%jx new 0x%x", (uintptr_t)vp, ip->i_size, off + bsize, 0); ip->i_size = off + bsize; DIP_ASSIGN(ip, size, ip->i_size); } off += bsize; len -= bsize; } out: UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); return error; } void ufs_gop_markupdate(struct vnode *vp, int flags) { u_int32_t mask = 0; if ((flags & GOP_UPDATE_ACCESSED) != 0) { mask = IN_ACCESS; } if ((flags & GOP_UPDATE_MODIFIED) != 0) { if (vp->v_type == VREG) { mask |= IN_CHANGE | IN_UPDATE; } else { mask |= IN_MODIFY; } } if (mask) { struct inode *ip = VTOI(vp); ip->i_flag |= mask; } } int ufs_bufio(enum uio_rw rw, struct vnode *vp, void *buf, size_t len, off_t off, int ioflg, kauth_cred_t cred, size_t *aresid, struct lwp *l) { struct iovec iov; struct uio uio; int error; KASSERT(ISSET(ioflg, IO_NODELOCKED)); KASSERT(VOP_ISLOCKED(vp)); KASSERT(rw != UIO_WRITE || VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(rw != UIO_WRITE || vp->v_mount->mnt_wapbl == NULL || ISSET(ioflg, IO_JOURNALLOCKED)); iov.iov_base = buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = len; uio.uio_offset = off; uio.uio_rw = rw; UIO_SETUP_SYSSPACE(&uio); switch (rw) { case UIO_READ: error = UFS_BUFRD(vp, &uio, ioflg, cred); break; case UIO_WRITE: error = UFS_BUFWR(vp, &uio, ioflg, cred); break; default: panic("invalid uio rw: %d", (int)rw); } if (aresid) *aresid = uio.uio_resid; else if (uio.uio_resid && error == 0) error = EIO; KASSERT(VOP_ISLOCKED(vp)); KASSERT(rw != UIO_WRITE || VOP_ISLOCKED(vp) == LK_EXCLUSIVE); return error; }
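/*
 * Illustrative usage sketch, added for exposition; it is not part of
 * the original ufs_vnops.c.  ufs_bufio() above wraps a kernel-space
 * buffer in a single-segment iovec/uio and hands it to UFS_BUFRD() or
 * UFS_BUFWR().  ufs_symlink() earlier in this file uses it to write a
 * link target that does not fit in the inode's short-link area:
 *
 *	error = ufs_bufio(UIO_WRITE, vp, target, len, (off_t)0,
 *	    IO_NODELOCKED | IO_JOURNALLOCKED, cred, NULL, NULL);
 *
 * Here "target", "len" and "cred" are placeholders for the symlink
 * target, its length and the caller's credential.  As the KASSERTs in
 * ufs_bufio() require, IO_NODELOCKED must be set, the vnode must
 * already be locked (exclusively for a write), and a write on a WAPBL
 * file system must pass IO_JOURNALLOCKED, i.e. the caller already
 * holds the journal lock.
 */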
/* $NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $ */ /* $KAME: in6_pcb.c,v 1.84 2001/02/08 18:02:08 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/proc.h> #include <sys/kauth.h> #include <sys/domain.h> #include <sys/once.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip6.h> #include <netinet/portalgo.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/scope6_var.h> #include "faith.h" #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/ipsec6.h> #include <netipsec/key.h> #endif /* IPSEC */ #include <netinet/tcp_vtw.h> const struct in6_addr zeroin6_addr; #define IN6PCBHASH_PORT(table, lport) \ &(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash] #define IN6PCBHASH_BIND(table, laddr, lport) \ &(table)->inpt_bindhashtbl[ \ (((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \ (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + ntohs(lport)) & \ (table)->inpt_bindhash] #define IN6PCBHASH_CONNECT(table, faddr, fport, laddr, lport) \ &(table)->inpt_connecthashtbl[ \ ((((faddr)->s6_addr32[0] ^ (faddr)->s6_addr32[1] ^ \ (faddr)->s6_addr32[2] ^ (faddr)->s6_addr32[3]) + ntohs(fport)) + \ (((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \ (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + \ ntohs(lport))) & (table)->inpt_connecthash] int ip6_anonportmin = IPV6PORT_ANONMIN; int ip6_anonportmax = IPV6PORT_ANONMAX; int ip6_lowportmin = IPV6PORT_RESERVEDMIN; int ip6_lowportmax = IPV6PORT_RESERVEDMAX; void in6pcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize) { inpcb_init(table, bindhashsize, connecthashsize); table->inpt_lastport = (in_port_t)ip6_anonportmax; } /* * Bind address from sin6 to inp. */ static int in6pcb_bind_addr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l) { int error; int s; /* * We should check the family, but old programs * incorrectly fail to initialize it.
*/ if (sin6->sin6_family != AF_INET6) return EAFNOSUPPORT; #ifndef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) return EADDRNOTAVAIL; #endif if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return error; s = pserialize_read_enter(); if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (sin6->sin6_addr.s6_addr32[3]) { struct sockaddr_in sin; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; bcopy(&sin6->sin6_addr.s6_addr32[3], &sin.sin_addr, sizeof(sin.sin_addr)); if (!IN_MULTICAST(sin.sin_addr.s_addr)) { struct ifaddr *ifa; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL && (inp->inp_flags & IN6P_BINDANY) == 0) { error = EADDRNOTAVAIL; goto out; } } } } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { // succeed } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa = NULL; if ((inp->inp_flags & IN6P_FAITH) == 0) { ifa = ifa_ifwithaddr(sin6tosa(sin6)); if (ifa == NULL && (inp->inp_flags & IN6P_BINDANY) == 0) { error = EADDRNOTAVAIL; goto out; } } /* * bind to an anycast address might accidentally * cause sending a packet with an anycast source * address, so we forbid it. * * We should allow to bind to a deprecated address, * since the application dare to use it. * But, can we assume that they are careful enough * to check if the address is deprecated or not? * Maybe, as a safeguard, we should have a setsockopt * flag to control the bind(2) behavior against * deprecated addresses (default: forbid bind(2)). */ if (ifa && ifatoia6(ifa)->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) { error = EADDRNOTAVAIL; goto out; } } in6p_laddr(inp) = sin6->sin6_addr; error = 0; out: pserialize_read_exit(s); return error; } /* * Bind port from sin6 to inp. */ static int in6pcb_bind_port(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcbtable *table = inp->inp_table; struct socket *so = inp->inp_socket; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild = 1; if (sin6->sin6_port != 0) { enum kauth_network_req req; #ifndef IPNOPRIVPORTS if (ntohs(sin6->sin6_port) < IPV6PORT_RESERVED) req = KAUTH_REQ_NETWORK_BIND_PRIVPORT; else #endif /* IPNOPRIVPORTS */ req = KAUTH_REQ_NETWORK_BIND_PORT; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so, sin6, NULL); if (error) return EACCES; } if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets.
*/ if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) reuseport = SO_REUSEADDR|SO_REUSEPORT; } if (sin6->sin6_port != 0) { if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET struct inpcb *t; struct vestigial_inpcb vestige; t = inpcb_lookup_local(table, *(struct in_addr *)&sin6->sin6_addr.s6_addr32[3], sin6->sin6_port, wild, &vestige); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; if (!t && vestige.valid && !(reuseport && vestige.reuse_port)) return EADDRINUSE; #else return EADDRNOTAVAIL; #endif } { struct inpcb *t; struct vestigial_inpcb vestige; t = in6pcb_lookup_local(table, &sin6->sin6_addr, sin6->sin6_port, wild, &vestige); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; if (!t && vestige.valid && !(reuseport && vestige.reuse_port)) return EADDRINUSE; } } if (sin6->sin6_port == 0) { int e; e = in6pcb_set_port(sin6, inp, l); if (e != 0) return e; } else { inp->inp_lport = sin6->sin6_port; inpcb_set_state(inp, INP_BOUND); } LIST_REMOVE(inp, inp_lhash); LIST_INSERT_HEAD(IN6PCBHASH_PORT(table, inp->inp_lport), inp, inp_lhash); return 0; } int in6pcb_bind(void *v, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcb *inp = v; struct sockaddr_in6 lsin6; int error; if (inp->inp_af != AF_INET6) return EINVAL; /* * If we already have a local port or a local address it means we're * bounded. */ if (inp->inp_lport || !(IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) || (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0))) return EINVAL; if (NULL != sin6) { /* We were provided a sockaddr_in6 to use. */ if (sin6->sin6_len != sizeof(*sin6)) return EINVAL; } else { /* We always bind to *something*, even if it's "anything". */ lsin6 = *((const struct sockaddr_in6 *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); sin6 = &lsin6; } /* Bind address. */ error = in6pcb_bind_addr(inp, sin6, l); if (error) return error; /* Bind port. */ error = in6pcb_bind_port(inp, sin6, l); if (error) { /* * Reset the address here to "any" so we don't "leak" the * inpcb. */ in6p_laddr(inp) = in6addr_any; return error; } #if 0 in6p_flowinfo(inp) = 0; /* XXX */ #endif return 0; } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin6. * If don't have a local address for this socket yet, * then pick one. 
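 * The pick is made below by in6_selectsrc(), or by in_selectsrc() when the destination is an IPv4-mapped address.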
*/ int in6pcb_connect(void *v, struct sockaddr_in6 *sin6, struct lwp *l) { struct inpcb *inp = v; struct in6_addr *in6a = NULL; struct in6_addr ia6; struct ifnet *ifp = NULL; /* outgoing interface */ int error = 0; int scope_ambiguous = 0; #ifdef INET struct in6_addr mapped; #endif struct sockaddr_in6 tmp; struct vestigial_inpcb vestige; struct psref psref; int bound; (void)&in6a; /* XXX fool gcc */ if (inp->inp_af != AF_INET6) return EINVAL; if (sin6->sin6_len != sizeof(*sin6)) return EINVAL; if (sin6->sin6_family != AF_INET6) return EAFNOSUPPORT; if (sin6->sin6_port == 0) return EADDRNOTAVAIL; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && inp->inp_socket->so_type == SOCK_STREAM) return EADDRNOTAVAIL; if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) return error; /* sanity check for mapped address case */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) return EINVAL; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) in6p_laddr(inp).s6_addr16[5] = htons(0xffff); if (!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) return EINVAL; } else { if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) return EINVAL; } /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; bound = curlwp_bind(); /* Source address selection. */ if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0) { #ifdef INET struct sockaddr_in sin; struct in_ifaddr *ia4; struct psref _psref; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; memcpy(&sin.sin_addr, &sin6->sin6_addr.s6_addr32[3], sizeof(sin.sin_addr)); ia4 = in_selectsrc(&sin, &inp->inp_route, inp->inp_socket->so_options, NULL, &error, &_psref); if (ia4 == NULL) { if (error == 0) error = EADDRNOTAVAIL; curlwp_bindx(bound); return error; } memset(&mapped, 0, sizeof(mapped)); mapped.s6_addr16[5] = htons(0xffff); memcpy(&mapped.s6_addr32[3], &IA_SIN(ia4)->sin_addr, sizeof(IA_SIN(ia4)->sin_addr)); ia4_release(ia4, &_psref); in6a = &mapped; #else curlwp_bindx(bound); return EADDRNOTAVAIL; #endif } else { /* * XXX: in6_selectsrc might replace the bound local address * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ error = in6_selectsrc(sin6, in6p_outputopts(inp), in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp), &ifp, &psref, &ia6); if (error == 0) in6a = &ia6; if (ifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) { if_put(ifp, &psref); curlwp_bindx(bound); return error; } if (in6a == NULL) { if_put(ifp, &psref); curlwp_bindx(bound); if (error == 0) error = EADDRNOTAVAIL; return error; } } if (ifp != NULL) { in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim(inp, ifp); if_put(ifp, &psref); } else in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim_rt(inp); curlwp_bindx(bound); if (in6pcb_lookup(inp->inp_table, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ? 
in6a : &in6p_laddr(inp), inp->inp_lport, 0, &vestige) || vestige.valid) return EADDRINUSE; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) || (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) && in6p_laddr(inp).s6_addr32[3] == 0)) { if (inp->inp_lport == 0) { error = in6pcb_bind(inp, NULL, l); if (error != 0) return error; } in6p_laddr(inp) = *in6a; } in6p_faddr(inp) = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* Late bind, if needed */ if (inp->inp_bindportonsend) { struct sockaddr_in6 lsin = *((const struct sockaddr_in6 *) inp->inp_socket->so_proto->pr_domain->dom_sa_any); lsin.sin6_addr = in6p_laddr(inp); lsin.sin6_port = 0; if ((error = in6pcb_bind_port(inp, &lsin, l)) != 0) return error; } inpcb_set_state(inp, INP_CONNECTED); in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK; if (ip6_auto_flowlabel) in6p_flowinfo(inp) |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); #if defined(IPSEC) if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM) ipsec_pcbconn(inp->inp_sp); #endif return 0; } void in6pcb_disconnect(struct inpcb *inp) { memset((void *)&in6p_faddr(inp), 0, sizeof(in6p_faddr(inp))); inp->inp_fport = 0; inpcb_set_state(inp, INP_BOUND); in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK; #if defined(IPSEC) if (ipsec_enabled) ipsec_pcbdisconn(inp->inp_sp); #endif if (inp->inp_socket->so_state & SS_NOFDREF) inpcb_destroy(inp); } void in6pcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in6 *sin6) { if (inp->inp_af != AF_INET6) return; sockaddr_in6_init(sin6, &in6p_laddr(inp), inp->inp_lport, 0, 0); (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ } void in6pcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in6 *sin6) { if (inp->inp_af != AF_INET6) return; sockaddr_in6_init(sin6, &in6p_faddr(inp), inp->inp_fport, 0, 0); (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. * * Must be called at splsoftnet. * * Note: src (4th arg) carries the flowlabel value on the original IPv6 * header, in sin6_flowinfo member. */ int in6pcb_notify(struct inpcbtable *table, const struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, void (*notify)(struct inpcb *, int)) { struct inpcb *inp; struct sockaddr_in6 sa6_src; const struct sockaddr_in6 *sa6_dst; in_port_t fport = fport_arg, lport = lport_arg; int errno; int nmatch = 0; u_int32_t flowinfo; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return 0; sa6_dst = (const struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return 0; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6pcb_rtchange to invalidate the route cache. * Dead host indications: also use in6pcb_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. 
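 * Returns the number of PCBs that matched.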
*/ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; memset((void *)&sa6_src.sin6_addr, 0, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6pcb_rtchange; } errno = inet6ctlerrmap[cmd]; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { struct rtentry *rt = NULL; if (inp->inp_af != AF_INET6) continue; /* * Under the following condition, notify of redirects * to the pcb, without making address matches against inpcb. * - redirect notification is arrived. * - the inpcb is unconnected. * - the inpcb is caching !RTF_HOST routing entry. * - the ICMPv6 notification is from the gateway cached in the * inpcb. i.e. ICMPv6 notification is from nexthop gateway * the inpcb used very recently. * * This is to improve interaction between netbsd/openbsd * redirect handling code, and inpcb route cache code. * without the clause, !RTF_HOST routing entry (which carries * gateway used by inpcb right before the ICMPv6 redirect) * will be cached forever in unconnected inpcb. * * There still is a question regarding to what is TRT: * - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be * generated on packet output. inpcb will always cache * RTF_HOST routing entry so there's no need for the clause * (ICMPv6 redirect will update RTF_HOST routing entry, * and inpcb is caching it already). * However, bsdi/freebsd are vulnerable to local DoS attacks * due to the cloned routing entries. * - Specwise, "destination cache" is mentioned in RFC2461. * Jinmei says that it implies bsdi/freebsd behavior, itojun * is not really convinced. * - Having hiwat/lowat on # of cloned host route (redirect/ * pmtud) may be a good idea. netbsd/openbsd has it. see * icmp6_mtudisc_update(). */ if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) && IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) && (rt = rtcache_validate(&inp->inp_route)) != NULL && !(rt->rt_flags & RTF_HOST)) { const struct sockaddr_in6 *dst6; dst6 = (const struct sockaddr_in6 *) rtcache_getdst(&inp->inp_route); if (dst6 == NULL) ; else if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &sa6_dst->sin6_addr)) { rtcache_unref(rt, &inp->inp_route); goto do_notify; } } rtcache_unref(rt, &inp->inp_route); /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. Note that we notify for all * disconnected sockets if the corresponding application * wanted. This is because some UDP applications keep sending * sockets disconnected. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) || IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr))) { ip6_notify_pmtu(inp, (const struct sockaddr_in6 *)dst, (u_int32_t *)cmdarg); } /* * Detect if we should notify the error. If no source and * destination ports are specified, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. 
*/ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (in6p_flowinfo(inp) & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr) || inp->inp_socket == NULL || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) continue; do_notify: if (notify) (*notify)(inp, errno); nmatch++; } return nmatch; } void in6pcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp) { struct inpcb *inp; struct ip6_moptions *im6o; struct in6_multi_mship *imm, *nimm; KASSERT(ifp != NULL); TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { bool need_unlock = false; if (inp->inp_af != AF_INET6) continue; /* The caller holds either one of inps' lock */ if (!inp_locked(inp)) { inp_lock(inp); need_unlock = true; } im6o = in6p_moptions(inp); if (im6o) { /* * Unselect the outgoing interface if it is being * detached. */ if (im6o->im6o_multicast_if_index == ifp->if_index) im6o->im6o_multicast_if_index = 0; /* * Drop multicast group membership if we joined * through the interface being detached. * XXX controversial - is it really legal for kernel * to force this? */ LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) { if (imm->i6mm_maddr->in6m_ifp == ifp) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } } } in_purgeifmcast(inp->inp_moptions, ifp); if (need_unlock) inp_unlock(inp); } } void in6pcb_purgeif(struct inpcbtable *table, struct ifnet *ifp) { struct rtentry *rt; struct inpcb *inp; TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { if (inp->inp_af != AF_INET6) continue; if ((rt = rtcache_validate(&inp->inp_route)) != NULL && rt->rt_ifp == ifp) { rtcache_unref(rt, &inp->inp_route); in6pcb_rtchange(inp, 0); } else rtcache_unref(rt, &inp->inp_route); } } /* * After a routing change, flush old routing. A new route can be * allocated the next time output is attempted. */ void in6pcb_rtchange(struct inpcb *inp, int errno) { if (inp->inp_af != AF_INET6) return; rtcache_free(&inp->inp_route); /* * A new route can be allocated the next time * output is attempted. 
*/ } struct inpcb * in6pcb_lookup_local(struct inpcbtable *table, struct in6_addr *laddr6, u_int lport_arg, int lookup_wildcard, struct vestigial_inpcb *vp) { struct inpcbhead *head; struct inpcb *inp, *match = NULL; int matchwild = 3, wildcard; in_port_t lport = lport_arg; if (vp) vp->valid = 0; head = IN6PCBHASH_PORT(table, lport); LIST_FOREACH(inp, head, inp_lhash) { if (inp->inp_af != AF_INET6) continue; if (inp->inp_lport != lport) continue; wildcard = 0; if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) wildcard++; if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (!IN6_IS_ADDR_V4MAPPED(laddr6)) continue; /* duplicate of IPv4 logic */ wildcard = 0; if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)) && in6p_faddr(inp).s6_addr32[3]) wildcard++; if (!in6p_laddr(inp).s6_addr32[3]) { if (laddr6->s6_addr32[3]) wildcard++; } else { if (!laddr6->s6_addr32[3]) wildcard++; else { if (in6p_laddr(inp).s6_addr32[3] != laddr6->s6_addr32[3]) continue; } } } else if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; } else { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; } if (IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; else { if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) continue; } } if (wildcard && !lookup_wildcard) continue; if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } if (match && matchwild == 0) return match; if (vp && table->vestige && table->vestige->init_ports6) { struct vestigial_inpcb better; bool has_better = false; void *state; state = (*table->vestige->init_ports6)(laddr6, lport_arg, lookup_wildcard); while (table->vestige && (*table->vestige->next_port6)(state, vp)) { if (vp->lport != lport) continue; wildcard = 0; if (!IN6_IS_ADDR_UNSPECIFIED(&vp->faddr.v6)) wildcard++; if (IN6_IS_ADDR_UNSPECIFIED(&vp->laddr.v6)) { if (!IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; } else { if (IN6_IS_ADDR_V4MAPPED(laddr6)) { if (vp->v6only) continue; } if (IN6_IS_ADDR_UNSPECIFIED(laddr6)) wildcard++; else { if (!IN6_ARE_ADDR_EQUAL(&vp->laddr.v6, laddr6)) continue; } } if (wildcard && !lookup_wildcard) continue; if (wildcard < matchwild) { better = *vp; has_better = true; matchwild = wildcard; if (matchwild == 0) break; } } if (has_better) { *vp = better; return 0; } } return match; } /* * WARNING: return value (rtentry) could be IPv4 one if inpcb is connected to * IPv4 mapped address. 
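 * (In that case the route cache below is keyed by the IPv4 address embedded in the mapped foreign address.)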
*/ struct rtentry * in6pcb_rtentry(struct inpcb *inp) { struct rtentry *rt; struct route *ro; union { const struct sockaddr *sa; const struct sockaddr_in6 *sa6; #ifdef INET const struct sockaddr_in *sa4; #endif } cdst; ro = &inp->inp_route; if (inp->inp_af != AF_INET6) return NULL; cdst.sa = rtcache_getdst(ro); if (cdst.sa == NULL) ; #ifdef INET else if (cdst.sa->sa_family == AF_INET) { KASSERT(IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))); if (cdst.sa4->sin_addr.s_addr != in6p_faddr(inp).s6_addr32[3]) rtcache_free(ro); } #endif else { if (!IN6_ARE_ADDR_EQUAL(&cdst.sa6->sin6_addr, &in6p_faddr(inp))) rtcache_free(ro); } if ((rt = rtcache_validate(ro)) == NULL) rt = rtcache_update(ro, 1); #ifdef INET if (rt == NULL && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { union { struct sockaddr dst; struct sockaddr_in dst4; } u; struct in_addr addr; addr.s_addr = in6p_faddr(inp).s6_addr32[3]; sockaddr_in_init(&u.dst4, &addr, 0); if (rtcache_setdst(ro, &u.dst) != 0) return NULL; rt = rtcache_init(ro); } else #endif if (rt == NULL && !IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) { union { struct sockaddr dst; struct sockaddr_in6 dst6; } u; sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0); if (rtcache_setdst(ro, &u.dst) != 0) return NULL; rt = rtcache_init(ro); } return rt; } void in6pcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp) { rtcache_unref(rt, &inp->inp_route); } struct inpcb * in6pcb_lookup(struct inpcbtable *table, const struct in6_addr *faddr6, u_int fport_arg, const struct in6_addr *laddr6, u_int lport_arg, int faith, struct vestigial_inpcb *vp) { struct inpcbhead *head; struct inpcb *inp; in_port_t fport = fport_arg, lport = lport_arg; if (vp) vp->valid = 0; head = IN6PCBHASH_CONNECT(table, faddr6, fport, laddr6, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; /* find exact match on both source and dest */ if (inp->inp_fport != fport) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) continue; if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), faddr6)) continue; if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) continue; if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) continue; if ((IN6_IS_ADDR_V4MAPPED(laddr6) || IN6_IS_ADDR_V4MAPPED(faddr6)) && (inp->inp_flags & IN6P_IPV6_V6ONLY)) continue; return inp; } if (vp && table->vestige) { if ((*table->vestige->lookup6)(faddr6, fport_arg, laddr6, lport_arg, vp)) return NULL; } return NULL; } struct inpcb * in6pcb_lookup_bound(struct inpcbtable *table, const struct in6_addr *laddr6, u_int lport_arg, int faith) { struct inpcbhead *head; struct inpcb *inp; in_port_t lport = lport_arg; #ifdef INET struct in6_addr zero_mapped; #endif head = IN6PCBHASH_BIND(table, laddr6, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_V4MAPPED(laddr6) && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6)) goto out; } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(laddr6)) { memset(&zero_mapped, 0, sizeof(zero_mapped)); zero_mapped.s6_addr16[5] = 0xffff; head = IN6PCBHASH_BIND(table, &zero_mapped, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if 
(IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zero_mapped)) goto out; } } #endif head = IN6PCBHASH_BIND(table, &zeroin6_addr, lport); LIST_FOREACH(inp, head, inp_hash) { if (inp->inp_af != AF_INET6) continue; if (faith && (inp->inp_flags & IN6P_FAITH) == 0) continue; if (inp->inp_fport != 0) continue; if (inp->inp_lport != lport) continue; if (IN6_IS_ADDR_V4MAPPED(laddr6) && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) continue; if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zeroin6_addr)) goto out; } return NULL; out: if (inp != LIST_FIRST(head)) { LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } return inp; } void in6pcb_set_state(struct inpcb *inp, int state) { if (inp->inp_af != AF_INET6) return; if (inp->inp_state > INP_ATTACHED) LIST_REMOVE(inp, inp_hash); switch (state) { case INP_BOUND: LIST_INSERT_HEAD(IN6PCBHASH_BIND(inp->inp_table, &in6p_laddr(inp), inp->inp_lport), inp, inp_hash); break; case INP_CONNECTED: LIST_INSERT_HEAD(IN6PCBHASH_CONNECT(inp->inp_table, &in6p_faddr(inp), inp->inp_fport, &in6p_laddr(inp), inp->inp_lport), inp, inp_hash); break; } inp->inp_state = state; }
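/*
 * Editorial sketch, not part of the original sources: the IN6PCBHASH_BIND and
 * IN6PCBHASH_CONNECT macros near the top of the file above pick a hash bucket
 * by XOR-folding each 128-bit IPv6 address into a single 32-bit word, adding
 * the port converted to host byte order, and masking the sum with the table's
 * hash mask (inpt_bindhash / inpt_connecthash).  The standalone C below
 * restates that computation for illustration only; hash6_fold(),
 * hash6_connect_index() and EX_HASH_MASK are names invented for this sketch
 * and do not exist in the kernel.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohs() */
#include <netinet/in.h>		/* struct in6_addr */

#define EX_HASH_MASK	127u	/* stand-in for table->inpt_connecthash */

/* XOR-fold the four 32-bit words of an IPv6 address into one word. */
static uint32_t
hash6_fold(const struct in6_addr *a)
{
	uint32_t w[4];

	memcpy(w, a->s6_addr, sizeof(w));
	return w[0] ^ w[1] ^ w[2] ^ w[3];
}

/*
 * Bucket index for a connected PCB: both endpoints contribute, so the full
 * 4-tuple lookup done by in6pcb_lookup() lands in a short chain.  The ports
 * are expected in network byte order, as they are stored in the PCB.
 */
static uint32_t
hash6_connect_index(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport)
{
	return ((hash6_fold(faddr) + ntohs(fport)) +
	    (hash6_fold(laddr) + ntohs(lport))) & EX_HASH_MASK;
}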
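/*
 * Editorial sketch, not part of the original sources: in the routing-socket
 * code that follows (rtsock_shared.c), the RO_MSGFILTER socket option installs
 * a per-socket bitmask in which bit (1U << rtm_type) marks a message type the
 * socket wants (see RTMSGFILTER() and route_ctloutput()), and
 * COMPATNAME(route_filter) drops any message whose type bit is clear once a
 * mask is installed.  The userland C below restates only that bitmask
 * technique; msgfilter_build() and msgfilter_wants() are names invented for
 * this sketch.
 */
#include <stdbool.h>
#include <stddef.h>
#include <limits.h>

/*
 * Build the mask from an array of RTM_* type codes (the RO_MSGFILTER
 * payload).  A type that does not fit in the mask is an error, mirroring the
 * kernel's EOVERFLOW check.
 */
static bool
msgfilter_build(const unsigned char *types, size_t ntypes, unsigned int *maskp)
{
	unsigned int mask = 0;
	size_t i;

	for (i = 0; i < ntypes; i++) {
		if (types[i] >= sizeof(mask) * CHAR_BIT)
			return false;
		mask |= 1U << types[i];
	}
	*maskp = mask;
	return true;
}

/* Delivery check, in the same order as the kernel's route_filter(). */
static bool
msgfilter_wants(unsigned int mask, unsigned char type)
{
	if (mask == 0)		/* no filter installed: accept everything */
		return true;
	if (type >= sizeof(mask) * CHAR_BIT)
		return false;	/* cannot be represented in the mask */
	return (mask & (1U << type)) != 0;
}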
/* $NetBSD: rtsock_shared.c,v 1.23 2022/10/04 07:06:31 msaitoh Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: rtsock_shared.c,v 1.23 2022/10/04 07:06:31 msaitoh Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_net_mpsafe.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/sysctl.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/intr.h> #include <sys/condvar.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/route.h> #include <net/raw_cb.h> #include <netinet/in_var.h> #include <netinet/if_inarp.h> #include <netmpls/mpls.h> #include <compat/net/if.h> #include <compat/net/route.h> /* sa_family is after sa_len, rest is data */ #define _SA_MINSIZE (offsetof(struct sockaddr, sa_family) + \ sizeof(((struct sockaddr *)0)->sa_family)) #ifdef COMPAT_RTSOCK /* * These are used when #include-d from compat/common/rtsock_50.c */ #define RTM_XVERSION RTM_OVERSION #define RTM_XNEWADDR RTM_ONEWADDR #define RTM_XDELADDR RTM_ODELADDR #define RTM_XCHGADDR RTM_OCHGADDR #define RT_XADVANCE(a,b) RT_OADVANCE(a,b) #define RT_XROUNDUP(n) RT_OROUNDUP(n) #define PF_XROUTE PF_OROUTE #define rt_xmsghdr rt_msghdr50 #define if_xmsghdr if_msghdr /* if_msghdr50 is for RTM_OIFINFO */ #define ifa_xmsghdr ifa_msghdr50 #define if_xannouncemsghdr if_announcemsghdr50 #define COMPATNAME(x) compat_50_ ## x #define DOMAINNAME "oroute" #define COMPATCALL(name, args) \ MODULE_HOOK_CALL_VOID(rtsock_ ## name ## _50_hook, args, __nothing); #define RTS_CTASSERT(x) __nothing CTASSERT(sizeof(struct ifa_xmsghdr) == 20); DOMAIN_DEFINE(compat_50_routedomain); /* forward declare and add to link set */ #else /* COMPAT_RTSOCK */ /* * These are used normally, when not #include-d from compat/common/rtsock_50.c */ #define RTM_XVERSION RTM_VERSION #define RTM_XNEWADDR RTM_NEWADDR #define RTM_XDELADDR RTM_DELADDR #define RTM_XCHGADDR RTM_CHGADDR #define RT_XADVANCE(a,b) RT_ADVANCE(a,b) #define RT_XROUNDUP(n) RT_ROUNDUP(n) #define PF_XROUTE PF_ROUTE #define rt_xmsghdr rt_msghdr #define if_xmsghdr if_msghdr #define ifa_xmsghdr ifa_msghdr #define if_xannouncemsghdr if_announcemsghdr #define COMPATNAME(x) x #define DOMAINNAME "route" #define COMPATCALL(name, args) __nothing; #define RTS_CTASSERT(x) CTASSERT(x) CTASSERT(sizeof(struct ifa_xmsghdr) == 32); DOMAIN_DEFINE(routedomain); /* forward declare and add to link set */ #endif /* COMPAT_RTSOCK */ #ifdef RTSOCK_DEBUG #define RT_IN_PRINT(info, b, a) (in_print((b), sizeof(b), \ &((const struct sockaddr_in *)(info)->rti_info[(a)])->sin_addr), (b)) #endif /* RTSOCK_DEBUG */ struct route_info COMPATNAME(route_info) = { .ri_dst = { .sa_len = 2, .sa_family = PF_XROUTE, }, .ri_src = { .sa_len = 2, .sa_family = PF_XROUTE, }, .ri_maxqlen = IFQ_MAXLEN, }; static void COMPATNAME(route_init)(void); static int COMPATNAME(route_output)(struct mbuf *, struct socket *); static int 
rt_xaddrs(u_char, const char *, const char *, struct rt_addrinfo *); static struct mbuf *rt_makeifannouncemsg(struct ifnet *, int, int, struct rt_addrinfo *); static int rt_msg2(int, struct rt_addrinfo *, void *, struct rt_walkarg *, int *); static void _rt_setmetrics(int, const struct rt_xmsghdr *, struct rtentry *); static void rtm_setmetrics(const struct rtentry *, struct rt_xmsghdr *); static void rt_adjustcount(int, int); static const struct protosw COMPATNAME(route_protosw)[]; struct routecb { struct rawcb rocb_rcb; unsigned int rocb_msgfilter; #define RTMSGFILTER(m) (1U << (m)) char *rocb_missfilter; size_t rocb_missfilterlen; }; #define sotoroutecb(so) ((struct routecb *)(so)->so_pcb) static struct rawcbhead rt_rawcb; #ifdef NET_MPSAFE static kmutex_t *rt_so_mtx; static bool rt_updating = false; static kcondvar_t rt_update_cv; #endif static void rt_adjustcount(int af, int cnt) { struct route_cb * const cb = &COMPATNAME(route_info).ri_cb; cb->any_count += cnt; switch (af) { case AF_INET: cb->ip_count += cnt; return; #ifdef INET6 case AF_INET6: cb->ip6_count += cnt; return; #endif case AF_MPLS: cb->mpls_count += cnt; return; } } static int COMPATNAME(route_filter)(struct mbuf *m, struct sockproto *proto, struct rawcb *rp) { struct routecb *rop = (struct routecb *)rp; struct rt_xmsghdr rtm; KASSERT(m != NULL); KASSERT(proto != NULL); KASSERT(rp != NULL); /* Wrong family for this socket. */ if (proto->sp_family != PF_ROUTE) return ENOPROTOOPT; /* If no filter set, just return. */ if (rop->rocb_msgfilter == 0 && rop->rocb_missfilterlen == 0) return 0; /* Ensure we can access rtm_type */ if (m->m_len < offsetof(struct rt_xmsghdr, rtm_type) + sizeof(rtm.rtm_type)) return EINVAL; m_copydata(m, offsetof(struct rt_xmsghdr, rtm_type), sizeof(rtm.rtm_type), &rtm.rtm_type); if (rtm.rtm_type >= sizeof(rop->rocb_msgfilter) * CHAR_BIT) return EINVAL; /* If the rtm type is filtered out, return a positive. */ if (rop->rocb_msgfilter != 0 && !(rop->rocb_msgfilter & RTMSGFILTER(rtm.rtm_type))) return EEXIST; if (rop->rocb_missfilterlen != 0 && rtm.rtm_type == RTM_MISS) { __CTASSERT(RTAX_DST == 0); struct sockaddr_storage ss; struct sockaddr *dst = (struct sockaddr *)&ss, *sa; char *cp = rop->rocb_missfilter; char *ep = cp + rop->rocb_missfilterlen; /* Ensure we can access sa_len */ if (m->m_pkthdr.len < sizeof(rtm) + _SA_MINSIZE) return EINVAL; m_copydata(m, sizeof(rtm) + offsetof(struct sockaddr, sa_len), sizeof(ss.ss_len), &ss.ss_len); if (ss.ss_len < _SA_MINSIZE || ss.ss_len > sizeof(ss) || m->m_pkthdr.len < sizeof(rtm) + ss.ss_len) return EINVAL; /* Copy out the destination sockaddr */ m_copydata(m, sizeof(rtm), ss.ss_len, &ss); /* Find a matching sockaddr in the filter */ while (cp < ep) { sa = (struct sockaddr *)cp; if (sa->sa_len == dst->sa_len && memcmp(sa, dst, sa->sa_len) == 0) break; cp += RT_XROUNDUP(sa->sa_len); } if (cp == ep) return EEXIST; } /* Passed the filter. 
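 * A return of 0 lets raw_input() deliver the message to this socket; the non-zero returns above make raw_input() skip it.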
*/ return 0; } static void rt_pr_init(void) { LIST_INIT(&rt_rawcb); } static int COMPATNAME(route_attach)(struct socket *so, int proto) { struct rawcb *rp; struct routecb *rop; int s, error; KASSERT(sotorawcb(so) == NULL); rop = kmem_zalloc(sizeof(*rop), KM_SLEEP); rp = &rop->rocb_rcb; rp->rcb_len = sizeof(*rop); so->so_pcb = rp; s = splsoftnet(); #ifdef NET_MPSAFE KASSERT(so->so_lock == NULL); mutex_obj_hold(rt_so_mtx); so->so_lock = rt_so_mtx; solock(so); #endif if ((error = raw_attach(so, proto, &rt_rawcb)) == 0) { rt_adjustcount(rp->rcb_proto.sp_protocol, 1); rp->rcb_laddr = &COMPATNAME(route_info).ri_src; rp->rcb_faddr = &COMPATNAME(route_info).ri_dst; rp->rcb_filter = COMPATNAME(route_filter); } splx(s); if (error) { kmem_free(rop, sizeof(*rop)); so->so_pcb = NULL; return error; } soisconnected(so); so->so_options |= SO_USELOOPBACK; KASSERT(solocked(so)); return error; } static void COMPATNAME(route_detach)(struct socket *so) { struct rawcb *rp = sotorawcb(so); struct routecb *rop = (struct routecb *)rp; int s; KASSERT(rp != NULL); KASSERT(solocked(so)); s = splsoftnet(); if (rop->rocb_missfilterlen != 0) kmem_free(rop->rocb_missfilter, rop->rocb_missfilterlen); rt_adjustcount(rp->rcb_proto.sp_protocol, -1); raw_detach(so); splx(s); } static int COMPATNAME(route_accept)(struct socket *so, struct sockaddr *nam) { KASSERT(solocked(so)); panic("route_accept"); return EOPNOTSUPP; } static int COMPATNAME(route_bind)(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_listen)(struct socket *so, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_connect)(struct socket *so, struct sockaddr *nam, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_connect2)(struct socket *so, struct socket *so2) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_disconnect)(struct socket *so) { struct rawcb *rp = sotorawcb(so); int s; KASSERT(solocked(so)); KASSERT(rp != NULL); s = splsoftnet(); soisdisconnected(so); raw_disconnect(rp); splx(s); return 0; } static int COMPATNAME(route_shutdown)(struct socket *so) { int s; KASSERT(solocked(so)); /* * Mark the connection as being incapable of further input. 
*/ s = splsoftnet(); socantsendmore(so); splx(s); return 0; } static int COMPATNAME(route_abort)(struct socket *so) { KASSERT(solocked(so)); panic("route_abort"); return EOPNOTSUPP; } static int COMPATNAME(route_ioctl)(struct socket *so, u_long cmd, void *nam, struct ifnet * ifp) { return EOPNOTSUPP; } static int COMPATNAME(route_stat)(struct socket *so, struct stat *ub) { KASSERT(solocked(so)); return 0; } static int COMPATNAME(route_peeraddr)(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setpeeraddr(rp, nam); return 0; } static int COMPATNAME(route_sockaddr)(struct socket *so, struct sockaddr *nam) { struct rawcb *rp = sotorawcb(so); KASSERT(solocked(so)); KASSERT(rp != NULL); KASSERT(nam != NULL); if (rp->rcb_faddr == NULL) return ENOTCONN; raw_setsockaddr(rp, nam); return 0; } static int COMPATNAME(route_rcvd)(struct socket *so, int flags, struct lwp *l) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_recvoob)(struct socket *so, struct mbuf *m, int flags) { KASSERT(solocked(so)); return EOPNOTSUPP; } static int COMPATNAME(route_send)(struct socket *so, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct lwp *l) { int error = 0; int s; KASSERT(solocked(so)); KASSERT(so->so_proto == &COMPATNAME(route_protosw)[0]); s = splsoftnet(); error = raw_send(so, m, nam, control, l, &COMPATNAME(route_output)); splx(s); return error; } static int COMPATNAME(route_sendoob)(struct socket *so, struct mbuf *m, struct mbuf *control) { KASSERT(solocked(so)); m_freem(m); m_freem(control); return EOPNOTSUPP; } static int COMPATNAME(route_purgeif)(struct socket *so, struct ifnet *ifp) { panic("route_purgeif"); return EOPNOTSUPP; } #if defined(INET) || defined(INET6) static int route_get_sdl_index(struct rt_addrinfo *info, int *sdl_index) { struct rtentry *nrt; int error; error = rtrequest1(RTM_GET, info, &nrt); if (error != 0) return error; /* * nrt->rt_ifp->if_index may not be correct * due to changing to ifplo0. */ *sdl_index = satosdl(nrt->rt_gateway)->sdl_index; rt_unref(nrt); return 0; } #endif static void route_get_sdl(const struct ifnet *ifp, const struct sockaddr *dst, struct sockaddr_dl *sdl, int *flags) { struct llentry *la; KASSERT(ifp != NULL); IF_AFDATA_RLOCK(ifp); switch (dst->sa_family) { case AF_INET: la = lla_lookup(LLTABLE(ifp), 0, dst); break; case AF_INET6: la = lla_lookup(LLTABLE6(ifp), 0, dst); break; default: la = NULL; KASSERTMSG(0, "Invalid AF=%d\n", dst->sa_family); break; } IF_AFDATA_RUNLOCK(ifp); void *a = (LLE_IS_VALID(la) && (la->la_flags & LLE_VALID) == LLE_VALID) ? &la->ll_addr : NULL; a = sockaddr_dl_init(sdl, sizeof(*sdl), ifp->if_index, ifp->if_type, NULL, 0, a, ifp->if_addrlen); KASSERT(a != NULL); if (la != NULL) { *flags = la->la_flags; LLE_RUNLOCK(la); } } static int route_output_report(struct rtentry *rt, struct rt_addrinfo *info, struct rt_xmsghdr *rtm, struct rt_xmsghdr **new_rtm) { int len, error; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { const struct ifaddr *rtifa; const struct ifnet *ifp = rt->rt_ifp; info->rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr; /* rtifa used to be simply rt->rt_ifa. * If rt->rt_ifa != NULL, then * rt_get_ifa() != NULL. So this * ought to still be safe. 
--dyoung */ rtifa = rt_get_ifa(rt); info->rti_info[RTAX_IFA] = rtifa->ifa_addr; #ifdef RTSOCK_DEBUG if (info->rti_info[RTAX_IFA]->sa_family == AF_INET) { char ibuf[INET_ADDRSTRLEN]; char abuf[INET_ADDRSTRLEN]; printf("%s: copying out RTAX_IFA %s " "for info->rti_info[RTAX_DST] %s " "ifa_getifa %p ifa_seqno %p\n", __func__, RT_IN_PRINT(info, ibuf, RTAX_IFA), RT_IN_PRINT(info, abuf, RTAX_DST), (void *)rtifa->ifa_getifa, rtifa->ifa_seqno); } #endif /* RTSOCK_DEBUG */ if (ifp->if_flags & IFF_POINTOPOINT) info->rti_info[RTAX_BRD] = rtifa->ifa_dstaddr; else info->rti_info[RTAX_BRD] = NULL; rtm->rtm_index = ifp->if_index; } error = rt_msg2(rtm->rtm_type, info, NULL, NULL, &len); if (error) return error; if (len > rtm->rtm_msglen) { struct rt_xmsghdr *old_rtm = rtm; R_Malloc(*new_rtm, struct rt_xmsghdr *, len); if (*new_rtm == NULL) return ENOBUFS; (void)memcpy(*new_rtm, old_rtm, old_rtm->rtm_msglen); rtm = *new_rtm; } (void)rt_msg2(rtm->rtm_type, info, rtm, NULL, 0); rtm->rtm_flags = rt->rt_flags; rtm_setmetrics(rt, rtm); rtm->rtm_addrs = info->rti_addrs; return 0; } /*ARGSUSED*/ int COMPATNAME(route_output)(struct mbuf *m, struct socket *so) { struct sockproto proto = { .sp_family = PF_XROUTE, }; struct rt_xmsghdr hdr; struct rt_xmsghdr *rtm = NULL; struct rt_xmsghdr *old_rtm = NULL, *new_rtm = NULL; struct rtentry *rt = NULL; struct rtentry *saved_nrt = NULL; struct rt_addrinfo info; int len, error = 0; sa_family_t family; struct sockaddr_dl sdl; int bound = curlwp_bind(); bool do_rt_free = false; struct sockaddr_storage netmask; #define senderr(e) do { error = e; goto flush;} while (/*CONSTCOND*/ 0) if (m == NULL || ((m->m_len < sizeof(int32_t)) && (m = m_pullup(m, sizeof(int32_t))) == NULL)) { error = ENOBUFS; goto out; } if ((m->m_flags & M_PKTHDR) == 0) panic("%s", __func__); len = m->m_pkthdr.len; if (len < sizeof(*rtm)) { info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } m_copydata(m, 0, sizeof(hdr), &hdr); if (len != hdr.rtm_msglen) { info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_xmsghdr *, len); if (rtm == NULL) { info.rti_info[RTAX_DST] = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, rtm); if (rtm->rtm_version != RTM_XVERSION) { info.rti_info[RTAX_DST] = NULL; senderr(EPROTONOSUPPORT); } rtm->rtm_pid = curproc->p_pid; memset(&info, 0, sizeof(info)); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs(rtm->rtm_type, (const char *)(rtm + 1), len + (char *)rtm, &info)) { senderr(EINVAL); } info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || (info.rti_info[RTAX_DST]->sa_family >= AF_MAX)) { senderr(EINVAL); } #ifdef RTSOCK_DEBUG if (info.rti_info[RTAX_DST]->sa_family == AF_INET) { char abuf[INET_ADDRSTRLEN]; printf("%s: extracted info.rti_info[RTAX_DST] %s\n", __func__, RT_IN_PRINT(&info, abuf, RTAX_DST)); } #endif /* RTSOCK_DEBUG */ if (info.rti_info[RTAX_GATEWAY] != NULL && (info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) { senderr(EINVAL); } /* * Verify that the socket has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (kauth_authorize_network(so->so_cred, KAUTH_NETWORK_ROUTE, 0, rtm, NULL, NULL) != 0) senderr(EACCES); /* * route(8) passes a sockaddr truncated with prefixlen. * The kernel doesn't expect such sockaddr and need to * use a buffer that is big enough for the sockaddr expected * (padded with 0's). We keep the original length of the sockaddr. 
*/ if (info.rti_info[RTAX_NETMASK]) { /* * Use the family of RTAX_DST, because RTAX_NETMASK * can have a zero family if it comes from the radix * tree via rt_mask(). */ socklen_t sa_len = sockaddr_getsize_by_family( info.rti_info[RTAX_DST]->sa_family); socklen_t masklen = sockaddr_getlen( info.rti_info[RTAX_NETMASK]); if (sa_len != 0 && sa_len > masklen) { KASSERT(sa_len <= sizeof(netmask)); memcpy(&netmask, info.rti_info[RTAX_NETMASK], masklen); memset((char *)&netmask + masklen, 0, sa_len - masklen); info.rti_info[RTAX_NETMASK] = sstocsa(&netmask); } } switch (rtm->rtm_type) { case RTM_ADD: if (info.rti_info[RTAX_GATEWAY] == NULL) { senderr(EINVAL); } #if defined(INET) || defined(INET6) /* support for new ARP/NDP code with keeping backcompat */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) { const struct sockaddr_dl *sdlp = satocsdl(info.rti_info[RTAX_GATEWAY]); /* Allow routing requests by interface index */ if (sdlp->sdl_nlen == 0 && sdlp->sdl_alen == 0 && sdlp->sdl_slen == 0) goto fallback; /* * Old arp binaries don't set the sdl_index * so we have to complement it. */ int sdl_index = sdlp->sdl_index; if (sdl_index == 0) { error = route_get_sdl_index(&info, &sdl_index); if (error != 0) goto fallback; } else if ( info.rti_info[RTAX_DST]->sa_family == AF_INET) { /* * XXX workaround for SIN_PROXY case; proxy arp * entry should be in an interface that has * a network route including the destination, * not a local (link) route that may not be a * desired place, for example a tap. */ const struct sockaddr_inarp *sina = (const struct sockaddr_inarp *) info.rti_info[RTAX_DST]; if (sina->sin_other & SIN_PROXY) { error = route_get_sdl_index(&info, &sdl_index); if (error != 0) goto fallback; } } error = lla_rt_output(rtm->rtm_type, rtm->rtm_flags, rtm->rtm_rmx.rmx_expire, &info, sdl_index); break; } fallback: #endif /* defined(INET) || defined(INET6) */ error = rtrequest1(rtm->rtm_type, &info, &saved_nrt); if (error == 0) { _rt_setmetrics(rtm->rtm_inits, rtm, saved_nrt); rt_unref(saved_nrt); } break; case RTM_DELETE: #if defined(INET) || defined(INET6) /* support for new ARP/NDP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { const struct sockaddr_dl *sdlp = satocsdl(info.rti_info[RTAX_GATEWAY]); error = lla_rt_output(rtm->rtm_type, rtm->rtm_flags, rtm->rtm_rmx.rmx_expire, &info, sdlp->sdl_index); rtm->rtm_flags &= ~RTF_UP; break; } #endif error = rtrequest1(rtm->rtm_type, &info, &saved_nrt); if (error != 0) break; rt = saved_nrt; do_rt_free = true; info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_TAG] = rt_gettag(rt); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } break; case RTM_GET: case RTM_CHANGE: case RTM_LOCK: /* XXX This will mask info.rti_info[RTAX_DST] with * info.rti_info[RTAX_NETMASK] before * searching. It did not used to do that. 
--dyoung */ rt = NULL; error = rtrequest1(RTM_GET, &info, &rt); if (error != 0) senderr(error); if (rtm->rtm_type != RTM_GET) {/* XXX: too grotty */ if (memcmp(info.rti_info[RTAX_DST], rt_getkey(rt), info.rti_info[RTAX_DST]->sa_len) != 0) senderr(ESRCH); if (info.rti_info[RTAX_NETMASK] == NULL && rt_mask(rt) != NULL) senderr(ETOOMANYREFS); } /* * XXX if arp/ndp requests an L2 entry, we have to obtain * it from lltable while for the route command we have to * return a route as it is. How to distinguish them? * For newer arp/ndp, RTF_LLDATA flag set by arp/ndp * indicates an L2 entry is requested. For old arp/ndp * binaries, we check RTF_UP flag is NOT set; it works * by the fact that arp/ndp don't set it while the route * command sets it. */ if (((rtm->rtm_flags & RTF_LLDATA) != 0 || (rtm->rtm_flags & RTF_UP) == 0) && rtm->rtm_type == RTM_GET && sockaddr_cmp(rt_getkey(rt), info.rti_info[RTAX_DST]) != 0) { int ll_flags = 0; route_get_sdl(rt->rt_ifp, info.rti_info[RTAX_DST], &sdl, &ll_flags); info.rti_info[RTAX_GATEWAY] = sstocsa(&sdl); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } rtm->rtm_flags |= RTF_LLDATA; rtm->rtm_flags &= ~RTF_CONNECTED; rtm->rtm_flags |= (ll_flags & LLE_STATIC) ? RTF_STATIC : 0; break; } switch (rtm->rtm_type) { case RTM_GET: info.rti_info[RTAX_DST] = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_TAG] = rt_gettag(rt); error = route_output_report(rt, &info, rtm, &new_rtm); if (error) senderr(error); if (new_rtm != NULL) { old_rtm = rtm; rtm = new_rtm; } break; case RTM_CHANGE: #ifdef NET_MPSAFE /* * Release rt_so_mtx to avoid a deadlock with route_intr * and also serialize updating routes to avoid another. */ if (rt_updating) { /* Release to allow the updater to proceed */ rt_unref(rt); rt = NULL; } while (rt_updating) { error = cv_wait_sig(&rt_update_cv, rt_so_mtx); if (error != 0) goto flush; } if (rt == NULL) { error = rtrequest1(RTM_GET, &info, &rt); if (error != 0) goto flush; } rt_updating = true; mutex_exit(rt_so_mtx); error = rt_update_prepare(rt); if (error == 0) { error = rt_update(rt, &info, rtm); rt_update_finish(rt); } mutex_enter(rt_so_mtx); rt_updating = false; cv_broadcast(&rt_update_cv); #else error = rt_update(rt, &info, rtm); #endif if (error != 0) goto flush; /*FALLTHROUGH*/ case RTM_LOCK: rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); rt->rt_rmx.rmx_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); break; } break; default: senderr(EOPNOTSUPP); } flush: if (rtm) { if (error) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; } family = info.rti_info[RTAX_DST] ? info.rti_info[RTAX_DST]->sa_family : 0; /* We cannot free old_rtm until we have stopped using the * pointers in info, some of which may point to sockaddrs * in old_rtm. */ if (old_rtm != NULL) Free(old_rtm); if (rt) { if (do_rt_free) { #ifdef NET_MPSAFE /* * Release rt_so_mtx to avoid a deadlock with * route_intr. */ mutex_exit(rt_so_mtx); rt_free(rt); mutex_enter(rt_so_mtx); #else rt_free(rt); #endif } else rt_unref(rt); } { struct rawcb *rp = NULL; /* * Check to see if we don't want our own messages. 
*/ if ((so->so_options & SO_USELOOPBACK) == 0) { if (COMPATNAME(route_info).ri_cb.any_count <= 1) { if (rtm) Free(rtm); m_freem(m); goto out; } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm) { m_copyback(m, 0, rtm->rtm_msglen, rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); Free(rtm); } if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ if (family) proto.sp_protocol = family; if (m) raw_input(m, &proto, &COMPATNAME(route_info).ri_src, &COMPATNAME(route_info).ri_dst, &rt_rawcb); if (rp) rp->rcb_proto.sp_family = PF_XROUTE; } out: curlwp_bindx(bound); return error; } static int route_ctloutput(int op, struct socket *so, struct sockopt *sopt) { struct routecb *rop = sotoroutecb(so); int error = 0; unsigned char *rtm_type, *cp, *ep; size_t len; unsigned int msgfilter; struct sockaddr *sa; KASSERT(solocked(so)); if (sopt->sopt_level != AF_ROUTE) { error = ENOPROTOOPT; } else switch (op) { case PRCO_SETOPT: switch (sopt->sopt_name) { case RO_MSGFILTER: msgfilter = 0; for (rtm_type = sopt->sopt_data, len = sopt->sopt_size; len != 0; rtm_type++, len -= sizeof(*rtm_type)) { /* Guard against overflowing our storage. */ if (*rtm_type >= sizeof(msgfilter) * CHAR_BIT) { error = EOVERFLOW; break; } msgfilter |= RTMSGFILTER(*rtm_type); } if (error == 0) rop->rocb_msgfilter = msgfilter; break; case RO_MISSFILTER: /* Validate the data */ len = 0; cp = sopt->sopt_data; ep = cp + sopt->sopt_size; while (cp < ep) { if (ep - cp < offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len)) break; if (++len > RO_FILTSA_MAX) { error = ENOBUFS; break; } sa = (struct sockaddr *)cp; if (sa->sa_len < _SA_MINSIZE || sa->sa_len >sizeof(struct sockaddr_storage)) return EINVAL; cp += RT_XROUNDUP(sa->sa_len); } if (cp != ep) { if (error == 0) error = EINVAL; break; } if (rop->rocb_missfilterlen != 0) kmem_free(rop->rocb_missfilter, rop->rocb_missfilterlen); if (sopt->sopt_size != 0) { rop->rocb_missfilter = kmem_alloc(sopt->sopt_size, KM_SLEEP); if (rop->rocb_missfilter == NULL) { rop->rocb_missfilterlen = 0; error = ENOBUFS; break; } } else rop->rocb_missfilter = NULL; rop->rocb_missfilterlen = sopt->sopt_size; if (rop->rocb_missfilterlen != 0) memcpy(rop->rocb_missfilter, sopt->sopt_data, rop->rocb_missfilterlen); break; default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (sopt->sopt_name) { case RO_MSGFILTER: error = ENOTSUP; break; default: error = ENOPROTOOPT; break; } } return error; } static void _rt_setmetrics(int which, const struct rt_xmsghdr *in, struct rtentry *out) { #define metric(f, e) if (which & (f)) out->rt_rmx.e = in->rtm_rmx.e; metric(RTV_RPIPE, rmx_recvpipe); metric(RTV_SPIPE, rmx_sendpipe); metric(RTV_SSTHRESH, rmx_ssthresh); metric(RTV_RTT, rmx_rtt); metric(RTV_RTTVAR, rmx_rttvar); metric(RTV_HOPCOUNT, rmx_hopcount); metric(RTV_MTU, rmx_mtu); #undef metric if (which & RTV_EXPIRE) { out->rt_rmx.rmx_expire = in->rtm_rmx.rmx_expire ? time_wall_to_mono(in->rtm_rmx.rmx_expire) : 0; } } static void rtm_setmetrics(const struct rtentry *in, struct rt_xmsghdr *out) { #define metric(e) out->rtm_rmx.e = in->rt_rmx.e; metric(rmx_recvpipe); metric(rmx_sendpipe); metric(rmx_ssthresh); metric(rmx_rtt); metric(rmx_rttvar); metric(rmx_hopcount); metric(rmx_mtu); metric(rmx_locks); #undef metric out->rtm_rmx.rmx_expire = in->rt_rmx.rmx_expire ? 
time_mono_to_wall(in->rt_rmx.rmx_expire) : 0; } static int rt_xaddrs(u_char rtmtype, const char *cp, const char *cplim, struct rt_addrinfo *rtinfo) { const struct sockaddr *sa = NULL; /* Quell compiler warning */ int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; rtinfo->rti_info[i] = sa = (const struct sockaddr *)cp; RT_XADVANCE(cp, sa); } /* * Check for extra addresses specified, except RTM_GET asking * for interface info. */ if (rtmtype == RTM_GET) { if (((rtinfo->rti_addrs & (~((1 << RTAX_IFP) | (1 << RTAX_IFA)))) & (~0U << i)) != 0) return 1; } else if ((rtinfo->rti_addrs & (~0U << i)) != 0) return 1; /* Check for bad data length. */ if (cp != cplim) { if (i == RTAX_NETMASK + 1 && sa != NULL && cp - RT_XROUNDUP(sa->sa_len) + sa->sa_len == cplim) /* * The last sockaddr was info.rti_info[RTAX_NETMASK]. * We accept this for now for the sake of old * binaries or third party softwares. */ ; else return 1; } return 0; } static int rt_getlen(int type) { RTS_CTASSERT(__alignof(struct ifa_msghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct if_msghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct if_announcemsghdr) >= sizeof(uint64_t)); RTS_CTASSERT(__alignof(struct rt_msghdr) >= sizeof(uint64_t)); switch (type) { case RTM_ODELADDR: case RTM_ONEWADDR: case RTM_OCHGADDR: if (rtsock_iflist_70_hook.hooked) return sizeof(struct ifa_msghdr70); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type %d\n", __func__, type); #endif return -1; } case RTM_DELADDR: case RTM_NEWADDR: case RTM_CHGADDR: return sizeof(struct ifa_xmsghdr); case RTM_OOIFINFO: if (rtsock_iflist_14_hook.hooked) return sizeof(struct if_msghdr14); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type RTM_OOIFINFO\n", __func__); #endif return -1; } case RTM_OIFINFO: if (rtsock_iflist_50_hook.hooked) return sizeof(struct if_msghdr50); else { #ifdef RTSOCK_DEBUG printf("%s: unsupported RTM type RTM_OIFINFO\n", __func__); #endif return -1; } case RTM_IFINFO: return sizeof(struct if_xmsghdr); case RTM_IFANNOUNCE: case RTM_IEEE80211: return sizeof(struct if_xannouncemsghdr); default: return sizeof(struct rt_xmsghdr); } } struct mbuf * COMPATNAME(rt_msg1)(int type, struct rt_addrinfo *rtinfo, void *data, int datalen) { struct rt_xmsghdr *rtm; struct mbuf *m; int i; const struct sockaddr *sa; int len, dlen; m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return m; MCLAIM(m, &COMPATNAME(routedomain).dom_mowner); if ((len = rt_getlen(type)) == -1) goto out; if (len > MHLEN + MLEN) panic("%s: message too long", __func__); else if (len > MHLEN) { m->m_next = m_get(M_DONTWAIT, MT_DATA); if (m->m_next == NULL) goto out; MCLAIM(m->m_next, m->m_owner); m->m_pkthdr.len = len; m->m_len = MHLEN; m->m_next->m_len = len - MHLEN; } else { m->m_pkthdr.len = m->m_len = len; } m_reset_rcvif(m); m_copyback(m, 0, datalen, data); if (len > datalen) (void)memset(mtod(m, char *) + datalen, 0, len - datalen); rtm = mtod(m, struct rt_xmsghdr *); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = RT_XROUNDUP(sa->sa_len); m_copyback(m, len, sa->sa_len, sa); if (dlen != sa->sa_len) { /* * Up to 7 + 1 nul's since roundup is to * sizeof(uint64_t) (8 bytes) */ m_copyback(m, len + sa->sa_len, dlen - sa->sa_len, "\0\0\0\0\0\0\0"); } len += dlen; } if (m->m_pkthdr.len != len) goto out; rtm->rtm_msglen = len; rtm->rtm_version = RTM_XVERSION; rtm->rtm_type = type; return m; out: m_freem(m); return NULL; } /* * rt_msg2 * * 
fills 'cp' or 'w'.w_tmem with the routing socket message and * returns the length of the message in 'lenp'. * * if walkarg is 0, cp is expected to be 0 or a buffer large enough to hold * the message * otherwise walkarg's w_needed is updated and if the user buffer is * specified and w_needed indicates space exists the information is copied * into the temp space (w_tmem). w_tmem is [re]allocated if necessary, * if the allocation fails ENOBUFS is returned. */ static int rt_msg2(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w, int *lenp) { int i; int len, dlen, second_time = 0; char *cp0, *cp = cpv; rtinfo->rti_addrs = 0; again: if ((len = rt_getlen(type)) == -1) return EINVAL; if ((cp0 = cp) != NULL) cp += len; for (i = 0; i < RTAX_MAX; i++) { const struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = RT_XROUNDUP(sa->sa_len); if (cp) { int diff = dlen - sa->sa_len; (void)memcpy(cp, sa, (size_t)sa->sa_len); cp += sa->sa_len; if (diff > 0) { (void)memset(cp, 0, (size_t)diff); cp += diff; } } len += dlen; } if (cp == NULL && w != NULL && !second_time) { struct rt_walkarg *rw = w; rw->w_needed += len; if (rw->w_needed <= 0 && rw->w_where) { if (rw->w_tmemsize < len) { if (rw->w_tmem) kmem_free(rw->w_tmem, rw->w_tmemsize); rw->w_tmem = kmem_zalloc(len, KM_SLEEP); rw->w_tmemsize = len; } if (rw->w_tmem) { cp = rw->w_tmem; second_time = 1; goto again; } else { rw->w_tmemneeded = len; return ENOBUFS; } } } if (cp) { struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)cp0; rtm->rtm_version = RTM_XVERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } if (lenp) *lenp = len; return 0; } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void COMPATNAME(rt_missmsg)(int type, const struct rt_addrinfo *rtinfo, int flags, int error) { struct rt_xmsghdr rtm; struct mbuf *m; const struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; struct rt_addrinfo info = *rtinfo; COMPATCALL(rt_missmsg, (type, rtinfo, flags, error)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; memset(&rtm, 0, sizeof(rtm)); rtm.rtm_pid = curproc->p_pid; rtm.rtm_flags = RTF_DONE | flags; rtm.rtm_errno = error; m = COMPATNAME(rt_msg1)(type, &info, &rtm, sizeof(rtm)); if (m == NULL) return; mtod(m, struct rt_xmsghdr *)->rtm_addrs = info.rti_addrs; COMPATNAME(route_enqueue)(m, sa ? sa->sa_family : 0); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void COMPATNAME(rt_ifmsg)(struct ifnet *ifp) { struct if_xmsghdr ifm; struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ifmsg, (ifp)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; (void)memset(&info, 0, sizeof(info)); (void)memset(&ifm, 0, sizeof(ifm)); ifm.ifm_index = ifp->if_index; ifm.ifm_flags = ifp->if_flags; if_export_if_data(ifp, &ifm.ifm_data, false); ifm.ifm_addrs = 0; m = COMPATNAME(rt_msg1)(RTM_IFINFO, &info, &ifm, sizeof(ifm)); if (m == NULL) return; COMPATNAME(route_enqueue)(m, 0); MODULE_HOOK_CALL_VOID(rtsock_oifmsg_14_hook, (ifp), __nothing); MODULE_HOOK_CALL_VOID(rtsock_oifmsg_50_hook, (ifp), __nothing); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. 
* if we ever reverse the logic and replace messages TO the routing * socket indicate a request to configure interfaces, then it will * be unnecessary as the routing socket will automatically generate * copies of it. */ static void COMPATNAME(rt_addrmsg0)(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, const struct sockaddr *src) { #define cmdpass(__cmd, __pass) (((__cmd) << 2) | (__pass)) struct rt_addrinfo info; const struct sockaddr *sa; int pass; struct mbuf *m; struct ifnet *ifp; struct rt_xmsghdr rtm; struct ifa_xmsghdr ifam; int ncmd; KASSERT(ifa != NULL); KASSERT(ifa->ifa_addr != NULL); ifp = ifa->ifa_ifp; if (cmd == RTM_ADD && vec_sctp_add_ip_address != NULL) { (*vec_sctp_add_ip_address)(ifa); } else if (cmd == RTM_DELETE && vec_sctp_delete_ip_address != NULL) { (*vec_sctp_delete_ip_address)(ifa); } COMPATCALL(rt_addrmsg_rt, (cmd, ifa, error, rt)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; for (pass = 1; pass < 3; pass++) { memset(&info, 0, sizeof(info)); switch (cmdpass(cmd, pass)) { case cmdpass(RTM_ADD, 1): case cmdpass(RTM_CHANGE, 1): case cmdpass(RTM_DELETE, 2): case cmdpass(RTM_NEWADDR, 1): case cmdpass(RTM_DELADDR, 1): case cmdpass(RTM_CHGADDR, 1): switch (cmd) { case RTM_ADD: ncmd = RTM_XNEWADDR; break; case RTM_DELETE: ncmd = RTM_XDELADDR; break; case RTM_CHANGE: ncmd = RTM_XCHGADDR; break; case RTM_NEWADDR: ncmd = RTM_XNEWADDR; break; case RTM_DELADDR: ncmd = RTM_XDELADDR; break; case RTM_CHGADDR: ncmd = RTM_XCHGADDR; break; default: panic("%s: unknown command %d", __func__, cmd); } MODULE_HOOK_CALL_VOID(rtsock_newaddr_70_hook, (ncmd, ifa), __nothing); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; KASSERT(ifp->if_dl != NULL); info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr; info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; info.rti_info[RTAX_AUTHOR] = src; memset(&ifam, 0, sizeof(ifam)); ifam.ifam_index = ifp->if_index; ifam.ifam_metric = ifa->ifa_metric; ifam.ifam_flags = ifa->ifa_flags; #ifndef COMPAT_RTSOCK ifam.ifam_pid = curproc->p_pid; ifam.ifam_addrflags = if_addrflags(ifa); #endif m = COMPATNAME(rt_msg1)(ncmd, &info, &ifam, sizeof(ifam)); if (m == NULL) continue; mtod(m, struct ifa_xmsghdr *)->ifam_addrs = info.rti_addrs; break; case cmdpass(RTM_ADD, 2): case cmdpass(RTM_CHANGE, 2): case cmdpass(RTM_DELETE, 1): if (rt == NULL) continue; info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_DST] = sa = rt_getkey(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; memset(&rtm, 0, sizeof(rtm)); rtm.rtm_pid = curproc->p_pid; rtm.rtm_index = ifp->if_index; rtm.rtm_flags |= rt->rt_flags; rtm.rtm_errno = error; m = COMPATNAME(rt_msg1)(cmd, &info, &rtm, sizeof(rtm)); if (m == NULL) continue; mtod(m, struct rt_xmsghdr *)->rtm_addrs = info.rti_addrs; break; default: continue; } KASSERTMSG(m != NULL, "called with wrong command"); COMPATNAME(route_enqueue)(m, sa ? 
sa->sa_family : 0); } #undef cmdpass } void COMPATNAME(rt_addrmsg)(int cmd, struct ifaddr *ifa) { COMPATNAME(rt_addrmsg0)(cmd, ifa, 0, NULL, NULL); } void COMPATNAME(rt_addrmsg_rt)(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { COMPATNAME(rt_addrmsg0)(cmd, ifa, error, rt, NULL); } void COMPATNAME(rt_addrmsg_src)(int cmd, struct ifaddr *ifa, const struct sockaddr *src) { COMPATNAME(rt_addrmsg0)(cmd, ifa, 0, NULL, src); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_xannouncemsghdr ifan; memset(info, 0, sizeof(*info)); memset(&ifan, 0, sizeof(ifan)); ifan.ifan_index = ifp->if_index; strlcpy(ifan.ifan_name, ifp->if_xname, sizeof(ifan.ifan_name)); ifan.ifan_what = what; return COMPATNAME(rt_msg1)(type, info, &ifan, sizeof(ifan)); } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void COMPATNAME(rt_ifannouncemsg)(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ifannouncemsg, (ifp, what)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m == NULL) return; COMPATNAME(route_enqueue)(m, 0); } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void COMPATNAME(rt_ieee80211msg)(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; COMPATCALL(rt_ieee80211msg, (ifp, what, data, data_len)); if (COMPATNAME(route_info).ri_cb.any_count == 0) return; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m == NULL) return; /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } (void)memcpy(mtod(n, void *), data, data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { (void)memcpy(mtod(m, uint8_t *) + m->m_len, data, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_xannouncemsghdr *)->ifan_msglen += data_len; COMPATNAME(route_enqueue)(m, 0); } /* * Routing message software interrupt routine */ static void COMPATNAME(route_intr)(void *cookie) { struct sockproto proto = { .sp_family = PF_XROUTE, }; struct route_info * const ri = &COMPATNAME(route_info); struct mbuf *m; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); for (;;) { IFQ_LOCK(&ri->ri_intrq); IF_DEQUEUE(&ri->ri_intrq, m); IFQ_UNLOCK(&ri->ri_intrq); if (m == NULL) break; proto.sp_protocol = M_GETCTX(m, uintptr_t); #ifdef NET_MPSAFE mutex_enter(rt_so_mtx); #endif raw_input(m, &proto, &ri->ri_src, &ri->ri_dst, &rt_rawcb); #ifdef NET_MPSAFE mutex_exit(rt_so_mtx); #endif } SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Enqueue a message to the software interrupt routine. 
*/ void COMPATNAME(route_enqueue)(struct mbuf *m, int family) { struct route_info * const ri = &COMPATNAME(route_info); int wasempty; IFQ_LOCK(&ri->ri_intrq); if (IF_QFULL(&ri->ri_intrq)) { printf("%s: queue full, dropped message\n", __func__); IF_DROP(&ri->ri_intrq); IFQ_UNLOCK(&ri->ri_intrq); m_freem(m); } else { wasempty = IF_IS_EMPTY(&ri->ri_intrq); M_SETCTX(m, (uintptr_t)family); IF_ENQUEUE(&ri->ri_intrq, m); IFQ_UNLOCK(&ri->ri_intrq); if (wasempty) { kpreempt_disable(); softint_schedule(ri->ri_sih); kpreempt_enable(); } } } static void COMPATNAME(route_init)(void) { struct route_info * const ri = &COMPATNAME(route_info); #ifndef COMPAT_RTSOCK rt_init(); #ifdef NET_MPSAFE rt_so_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&rt_update_cv, "rtsock_cv"); #endif sysctl_net_route_setup(NULL, PF_ROUTE, "rtable"); #endif ri->ri_intrq.ifq_maxlen = ri->ri_maxqlen; ri->ri_sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE, COMPATNAME(route_intr), NULL); IFQ_LOCK_INIT(&ri->ri_intrq); #ifdef MBUFTRACE MOWNER_ATTACH(&COMPATNAME(routedomain).dom_mowner); #endif } /* * Definitions of protocols supported in the ROUTE domain. */ #ifndef COMPAT_RTSOCK PR_WRAP_USRREQS(route); #else PR_WRAP_USRREQS(compat_50_route); #endif static const struct pr_usrreqs route_usrreqs = { .pr_attach = COMPATNAME(route_attach_wrapper), .pr_detach = COMPATNAME(route_detach_wrapper), .pr_accept = COMPATNAME(route_accept_wrapper), .pr_bind = COMPATNAME(route_bind_wrapper), .pr_listen = COMPATNAME(route_listen_wrapper), .pr_connect = COMPATNAME(route_connect_wrapper), .pr_connect2 = COMPATNAME(route_connect2_wrapper), .pr_disconnect = COMPATNAME(route_disconnect_wrapper), .pr_shutdown = COMPATNAME(route_shutdown_wrapper), .pr_abort = COMPATNAME(route_abort_wrapper), .pr_ioctl = COMPATNAME(route_ioctl_wrapper), .pr_stat = COMPATNAME(route_stat_wrapper), .pr_peeraddr = COMPATNAME(route_peeraddr_wrapper), .pr_sockaddr = COMPATNAME(route_sockaddr_wrapper), .pr_rcvd = COMPATNAME(route_rcvd_wrapper), .pr_recvoob = COMPATNAME(route_recvoob_wrapper), .pr_send = COMPATNAME(route_send_wrapper), .pr_sendoob = COMPATNAME(route_sendoob_wrapper), .pr_purgeif = COMPATNAME(route_purgeif_wrapper), }; static const struct protosw COMPATNAME(route_protosw)[] = { { .pr_type = SOCK_RAW, .pr_domain = &COMPATNAME(routedomain), .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctlinput = raw_ctlinput, .pr_ctloutput = route_ctloutput, .pr_usrreqs = &route_usrreqs, .pr_init = rt_pr_init, }, }; struct domain COMPATNAME(routedomain) = { .dom_family = PF_XROUTE, .dom_name = DOMAINNAME, .dom_init = COMPATNAME(route_init), .dom_protosw = COMPATNAME(route_protosw), .dom_protoswNPROTOSW = &COMPATNAME(route_protosw)[__arraycount(COMPATNAME(route_protosw))], #ifdef MBUFTRACE .dom_mowner = MOWNER_INIT("route", "rtm"), #endif };
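/*
 * Illustrative userland sketch (not part of rtsock.c): the wire format that
 * route_output() above consumes is an rt_msghdr followed by the sockaddrs
 * named in rtm_addrs, each padded to the 8-byte RT_XROUNDUP boundary that
 * rt_xaddrs() walks.  In the native (non-COMPAT_RTSOCK) build, rt_xmsghdr
 * and RTM_XVERSION are simply rt_msghdr and RTM_VERSION from <net/route.h>.
 * The snippet below issues an RTM_GET for one IPv4 destination over a
 * PF_ROUTE socket; reading the kernel's echoed reply and error reporting
 * are trimmed for brevity.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/route.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

/* RTM_GET request: header immediately followed by the RTA_DST sockaddr. */
struct rtget_msg {
	struct rt_msghdr	rgm_hdr;
	struct sockaddr_in	rgm_dst;	/* 16 bytes, already 8-byte aligned */
};

static int
rtget_ipv4(in_addr_t dst)
{
	struct rtget_msg m;
	int s;

	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) == -1)
		return -1;

	memset(&m, 0, sizeof(m));
	m.rgm_hdr.rtm_msglen = sizeof(m);
	m.rgm_hdr.rtm_version = RTM_VERSION;
	m.rgm_hdr.rtm_type = RTM_GET;
	m.rgm_hdr.rtm_addrs = RTA_DST;		/* only RTAX_DST follows */
	m.rgm_hdr.rtm_seq = 1;

	m.rgm_dst.sin_len = sizeof(m.rgm_dst);
	m.rgm_dst.sin_family = AF_INET;
	m.rgm_dst.sin_addr.s_addr = dst;

	if (write(s, &m, sizeof(m)) != (ssize_t)sizeof(m)) {
		close(s);
		return -1;
	}
	/* The kernel echoes the completed message back; read(2) it here. */
	close(s);
	return 0;
}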
/* $NetBSD: prop_dictionary.c,v 1.46 2023/06/14 00:35:18 rin Exp $ */ /*- * Copyright (c) 2006, 2007, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC.
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "prop_object_impl.h" #include <prop/prop_array.h> #include <prop/prop_dictionary.h> #include <prop/prop_string.h> #include <sys/rbtree.h> #if !defined(_KERNEL) && !defined(_STANDALONE) #include <errno.h> #endif /* * We implement these like arrays, but we keep them sorted by key. * This allows us to binary-search as well as keep externalized output * sane-looking for human eyes. */ #define EXPAND_STEP 16 /* * prop_dictionary_keysym_t is allocated with space at the end to hold the * key. This must be a regular object so that we can maintain sane iterator * semantics -- we don't want to require that the caller release the result * of prop_object_iterator_next(). * * We'd like to have some small'ish keysym objects for up-to-16 characters * in a key, some for up-to-32 characters in a key, and then a final bucket * for up-to-128 characters in a key (not including NUL). Keys longer than * 128 characters are not allowed. */ struct _prop_dictionary_keysym { struct _prop_object pdk_obj; size_t pdk_size; struct rb_node pdk_link; char pdk_key[1]; /* actually variable length */ }; /* pdk_key[1] takes care of the NUL */ #define PDK_SIZE_16 (sizeof(struct _prop_dictionary_keysym) + 16) #define PDK_SIZE_32 (sizeof(struct _prop_dictionary_keysym) + 32) #define PDK_SIZE_128 (sizeof(struct _prop_dictionary_keysym) + 128) #define PDK_MAXKEY 128 _PROP_POOL_INIT(_prop_dictionary_keysym16_pool, PDK_SIZE_16, "pdict16") _PROP_POOL_INIT(_prop_dictionary_keysym32_pool, PDK_SIZE_32, "pdict32") _PROP_POOL_INIT(_prop_dictionary_keysym128_pool, PDK_SIZE_128, "pdict128") struct _prop_dict_entry { prop_dictionary_keysym_t pde_key; prop_object_t pde_objref; }; struct _prop_dictionary { struct _prop_object pd_obj; _PROP_RWLOCK_DECL(pd_rwlock) struct _prop_dict_entry *pd_array; unsigned int pd_capacity; unsigned int pd_count; int pd_flags; uint32_t pd_version; }; #define PD_F_IMMUTABLE 0x01 /* dictionary is immutable */ _PROP_POOL_INIT(_prop_dictionary_pool, sizeof(struct _prop_dictionary), "propdict") _PROP_MALLOC_DEFINE(M_PROP_DICT, "prop dictionary", "property dictionary container object") static _prop_object_free_rv_t _prop_dictionary_free(prop_stack_t, prop_object_t *); static void _prop_dictionary_emergency_free(prop_object_t); static bool _prop_dictionary_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_dictionary_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static void _prop_dictionary_equals_finish(prop_object_t, prop_object_t); static prop_object_iterator_t _prop_dictionary_iterator_locked(prop_dictionary_t); static prop_object_t _prop_dictionary_iterator_next_object_locked(void *); static prop_object_t _prop_dictionary_get_keysym(prop_dictionary_t, prop_dictionary_keysym_t, bool); static 
prop_object_t _prop_dictionary_get(prop_dictionary_t, const char *, bool); static void _prop_dictionary_lock(void); static void _prop_dictionary_unlock(void); static const struct _prop_object_type _prop_object_type_dictionary = { .pot_type = PROP_TYPE_DICTIONARY, .pot_free = _prop_dictionary_free, .pot_emergency_free = _prop_dictionary_emergency_free, .pot_extern = _prop_dictionary_externalize, .pot_equals = _prop_dictionary_equals, .pot_equals_finish = _prop_dictionary_equals_finish, .pot_lock = _prop_dictionary_lock, .pot_unlock = _prop_dictionary_unlock, }; static _prop_object_free_rv_t _prop_dict_keysym_free(prop_stack_t, prop_object_t *); static bool _prop_dict_keysym_externalize( struct _prop_object_externalize_context *, void *); static _prop_object_equals_rv_t _prop_dict_keysym_equals(prop_object_t, prop_object_t, void **, void **, prop_object_t *, prop_object_t *); static const struct _prop_object_type _prop_object_type_dict_keysym = { .pot_type = PROP_TYPE_DICT_KEYSYM, .pot_free = _prop_dict_keysym_free, .pot_extern = _prop_dict_keysym_externalize, .pot_equals = _prop_dict_keysym_equals, }; #define prop_object_is_dictionary(x) \ ((x) != NULL && (x)->pd_obj.po_type == &_prop_object_type_dictionary) #define prop_object_is_dictionary_keysym(x) \ ((x) != NULL && (x)->pdk_obj.po_type == &_prop_object_type_dict_keysym) #define prop_dictionary_is_immutable(x) \ (((x)->pd_flags & PD_F_IMMUTABLE) != 0) struct _prop_dictionary_iterator { struct _prop_object_iterator pdi_base; unsigned int pdi_index; }; /* * Dictionary key symbols are immutable, and we are likely to have many * duplicated key symbols. So, to save memory, we unique'ify key symbols * so we only have to have one copy of each string. */ static int /*ARGSUSED*/ _prop_dict_keysym_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED, const void *n1, const void *n2) { const struct _prop_dictionary_keysym *pdk1 = n1; const struct _prop_dictionary_keysym *pdk2 = n2; return strcmp(pdk1->pdk_key, pdk2->pdk_key); } static int /*ARGSUSED*/ _prop_dict_keysym_rb_compare_key(void *ctx _PROP_ARG_UNUSED, const void *n, const void *v) { const struct _prop_dictionary_keysym *pdk = n; const char *cp = v; return strcmp(pdk->pdk_key, cp); } static const rb_tree_ops_t _prop_dict_keysym_rb_tree_ops = { .rbto_compare_nodes = _prop_dict_keysym_rb_compare_nodes, .rbto_compare_key = _prop_dict_keysym_rb_compare_key, .rbto_node_offset = offsetof(struct _prop_dictionary_keysym, pdk_link), .rbto_context = NULL }; static struct rb_tree _prop_dict_keysym_tree; _PROP_ONCE_DECL(_prop_dict_init_once) _PROP_MUTEX_DECL_STATIC(_prop_dict_keysym_tree_mutex) static int _prop_dict_init(void) { _PROP_MUTEX_INIT(_prop_dict_keysym_tree_mutex); rb_tree_init(&_prop_dict_keysym_tree, &_prop_dict_keysym_rb_tree_ops); return 0; } static void _prop_dict_keysym_put(prop_dictionary_keysym_t pdk) { if (pdk->pdk_size <= PDK_SIZE_16) _PROP_POOL_PUT(_prop_dictionary_keysym16_pool, pdk); else if (pdk->pdk_size <= PDK_SIZE_32) _PROP_POOL_PUT(_prop_dictionary_keysym32_pool, pdk); else { _PROP_ASSERT(pdk->pdk_size <= PDK_SIZE_128); _PROP_POOL_PUT(_prop_dictionary_keysym128_pool, pdk); } } /* ARGSUSED */ static _prop_object_free_rv_t _prop_dict_keysym_free(prop_stack_t stack, prop_object_t *obj) { prop_dictionary_keysym_t pdk = *obj; rb_tree_remove_node(&_prop_dict_keysym_tree, pdk); _prop_dict_keysym_put(pdk); return _PROP_OBJECT_FREE_DONE; } static bool _prop_dict_keysym_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_dictionary_keysym_t pdk = v; /* We externalize 
these as strings, and they're never empty. */ _PROP_ASSERT(pdk->pdk_key[0] != '\0'); if (_prop_object_externalize_start_tag(ctx, "string") == false || _prop_object_externalize_append_encoded_cstring(ctx, pdk->pdk_key) == false || _prop_object_externalize_end_tag(ctx, "string") == false) return (false); return (true); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_dict_keysym_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_dictionary_keysym_t pdk1 = v1; prop_dictionary_keysym_t pdk2 = v2; /* * There is only ever one copy of a keysym at any given time, * so we can reduce this to a simple pointer equality check. */ if (pdk1 == pdk2) return _PROP_OBJECT_EQUALS_TRUE; else return _PROP_OBJECT_EQUALS_FALSE; } static prop_dictionary_keysym_t _prop_dict_keysym_alloc(const char *key) { prop_dictionary_keysym_t opdk, pdk, rpdk; size_t size; _PROP_ONCE_RUN(_prop_dict_init_once, _prop_dict_init); /* * Check to see if this already exists in the tree. If it does, * we just retain it and return it. */ _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); opdk = rb_tree_find_node(&_prop_dict_keysym_tree, key); if (opdk != NULL) { prop_object_retain(opdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); return (opdk); } _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); /* * Not in the tree. Create it now. */ size = sizeof(*pdk) + strlen(key) /* pdk_key[1] covers the NUL */; if (size <= PDK_SIZE_16) pdk = _PROP_POOL_GET(_prop_dictionary_keysym16_pool); else if (size <= PDK_SIZE_32) pdk = _PROP_POOL_GET(_prop_dictionary_keysym32_pool); else if (size <= PDK_SIZE_128) pdk = _PROP_POOL_GET(_prop_dictionary_keysym128_pool); else pdk = NULL; /* key too long */ if (pdk == NULL) return (NULL); _prop_object_init(&pdk->pdk_obj, &_prop_object_type_dict_keysym); strcpy(pdk->pdk_key, key); pdk->pdk_size = size; /* * We dropped the mutex when we allocated the new object, so * we have to check again if it is in the tree. */ _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); opdk = rb_tree_find_node(&_prop_dict_keysym_tree, key); if (opdk != NULL) { prop_object_retain(opdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); _prop_dict_keysym_put(pdk); return (opdk); } rpdk = rb_tree_insert_node(&_prop_dict_keysym_tree, pdk); _PROP_ASSERT(rpdk == pdk); _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); return (rpdk); } static _prop_object_free_rv_t _prop_dictionary_free(prop_stack_t stack, prop_object_t *obj) { prop_dictionary_t pd = *obj; prop_dictionary_keysym_t pdk; prop_object_t po; _PROP_ASSERT(pd->pd_count <= pd->pd_capacity); _PROP_ASSERT((pd->pd_capacity == 0 && pd->pd_array == NULL) || (pd->pd_capacity != 0 && pd->pd_array != NULL)); /* The empty dictorinary is easy, handle that first. */ if (pd->pd_count == 0) { if (pd->pd_array != NULL) _PROP_FREE(pd->pd_array, M_PROP_DICT); _PROP_RWLOCK_DESTROY(pd->pd_rwlock); _PROP_POOL_PUT(_prop_dictionary_pool, pd); return (_PROP_OBJECT_FREE_DONE); } po = pd->pd_array[pd->pd_count - 1].pde_objref; _PROP_ASSERT(po != NULL); if (stack == NULL) { /* * If we are in emergency release mode, * just let caller recurse down. */ *obj = po; return (_PROP_OBJECT_FREE_FAILED); } /* Otherwise, try to push the current object on the stack. */ if (!_prop_stack_push(stack, pd, NULL, NULL, NULL)) { /* Push failed, entering emergency release mode. */ return (_PROP_OBJECT_FREE_FAILED); } /* Object pushed on stack, caller will release it. 
*/ --pd->pd_count; pdk = pd->pd_array[pd->pd_count].pde_key; _PROP_ASSERT(pdk != NULL); prop_object_release(pdk); *obj = po; return (_PROP_OBJECT_FREE_RECURSE); } static void _prop_dictionary_lock(void) { /* XXX: once necessary or paranoia? */ _PROP_ONCE_RUN(_prop_dict_init_once, _prop_dict_init); _PROP_MUTEX_LOCK(_prop_dict_keysym_tree_mutex); } static void _prop_dictionary_unlock(void) { _PROP_MUTEX_UNLOCK(_prop_dict_keysym_tree_mutex); } static void _prop_dictionary_emergency_free(prop_object_t obj) { prop_dictionary_t pd = obj; prop_dictionary_keysym_t pdk; _PROP_ASSERT(pd->pd_count != 0); --pd->pd_count; pdk = pd->pd_array[pd->pd_count].pde_key; _PROP_ASSERT(pdk != NULL); prop_object_release(pdk); } static bool _prop_dictionary_externalize(struct _prop_object_externalize_context *ctx, void *v) { prop_dictionary_t pd = v; prop_dictionary_keysym_t pdk; struct _prop_object *po; prop_object_iterator_t pi; unsigned int i; bool rv = false; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); if (pd->pd_count == 0) { _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (_prop_object_externalize_empty_tag(ctx, "dict")); } if (_prop_object_externalize_start_tag(ctx, "dict") == false || _prop_object_externalize_append_char(ctx, '\n') == false) goto out; pi = _prop_dictionary_iterator_locked(pd); if (pi == NULL) goto out; ctx->poec_depth++; _PROP_ASSERT(ctx->poec_depth != 0); while ((pdk = _prop_dictionary_iterator_next_object_locked(pi)) != NULL) { po = _prop_dictionary_get_keysym(pd, pdk, true); if (po == NULL || _prop_object_externalize_start_tag(ctx, "key") == false || _prop_object_externalize_append_encoded_cstring(ctx, pdk->pdk_key) == false || _prop_object_externalize_end_tag(ctx, "key") == false || (*po->po_type->pot_extern)(ctx, po) == false) { prop_object_iterator_release(pi); goto out; } } prop_object_iterator_release(pi); ctx->poec_depth--; for (i = 0; i < ctx->poec_depth; i++) { if (_prop_object_externalize_append_char(ctx, '\t') == false) goto out; } if (_prop_object_externalize_end_tag(ctx, "dict") == false) goto out; rv = true; out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* ARGSUSED */ static _prop_object_equals_rv_t _prop_dictionary_equals(prop_object_t v1, prop_object_t v2, void **stored_pointer1, void **stored_pointer2, prop_object_t *next_obj1, prop_object_t *next_obj2) { prop_dictionary_t dict1 = v1; prop_dictionary_t dict2 = v2; uintptr_t idx; _prop_object_equals_rv_t rv = _PROP_OBJECT_EQUALS_FALSE; if (dict1 == dict2) return (_PROP_OBJECT_EQUALS_TRUE); _PROP_ASSERT(*stored_pointer1 == *stored_pointer2); idx = (uintptr_t)*stored_pointer1; if (idx == 0) { if ((uintptr_t)dict1 < (uintptr_t)dict2) { _PROP_RWLOCK_RDLOCK(dict1->pd_rwlock); _PROP_RWLOCK_RDLOCK(dict2->pd_rwlock); } else { _PROP_RWLOCK_RDLOCK(dict2->pd_rwlock); _PROP_RWLOCK_RDLOCK(dict1->pd_rwlock); } } if (dict1->pd_count != dict2->pd_count) goto out; if (idx == dict1->pd_count) { rv = _PROP_OBJECT_EQUALS_TRUE; goto out; } _PROP_ASSERT(idx < dict1->pd_count); *stored_pointer1 = (void *)(idx + 1); *stored_pointer2 = (void *)(idx + 1); *next_obj1 = dict1->pd_array[idx].pde_objref; *next_obj2 = dict2->pd_array[idx].pde_objref; if (!prop_dictionary_keysym_equals(dict1->pd_array[idx].pde_key, dict2->pd_array[idx].pde_key)) goto out; return (_PROP_OBJECT_EQUALS_RECURSE); out: _PROP_RWLOCK_UNLOCK(dict1->pd_rwlock); _PROP_RWLOCK_UNLOCK(dict2->pd_rwlock); return (rv); } static void _prop_dictionary_equals_finish(prop_object_t v1, prop_object_t v2) { _PROP_RWLOCK_UNLOCK(((prop_dictionary_t)v1)->pd_rwlock); 
_PROP_RWLOCK_UNLOCK(((prop_dictionary_t)v2)->pd_rwlock); } static prop_dictionary_t _prop_dictionary_alloc(unsigned int capacity) { prop_dictionary_t pd; struct _prop_dict_entry *array; if (capacity != 0) { array = _PROP_CALLOC(capacity * sizeof(*array), M_PROP_DICT); if (array == NULL) return (NULL); } else array = NULL; pd = _PROP_POOL_GET(_prop_dictionary_pool); if (pd != NULL) { _prop_object_init(&pd->pd_obj, &_prop_object_type_dictionary); _PROP_RWLOCK_INIT(pd->pd_rwlock); pd->pd_array = array; pd->pd_capacity = capacity; pd->pd_count = 0; pd->pd_flags = 0; pd->pd_version = 0; } else if (array != NULL) _PROP_FREE(array, M_PROP_DICT); return (pd); } static bool _prop_dictionary_expand(prop_dictionary_t pd, unsigned int capacity) { struct _prop_dict_entry *array, *oarray; /* * Dictionary must be WRITE-LOCKED. */ oarray = pd->pd_array; array = _PROP_CALLOC(capacity * sizeof(*array), M_PROP_DICT); if (array == NULL) return (false); if (oarray != NULL) memcpy(array, oarray, pd->pd_capacity * sizeof(*array)); pd->pd_array = array; pd->pd_capacity = capacity; if (oarray != NULL) _PROP_FREE(oarray, M_PROP_DICT); return (true); } static prop_object_t _prop_dictionary_iterator_next_object_locked(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd = pdi->pdi_base.pi_obj; prop_dictionary_keysym_t pdk = NULL; _PROP_ASSERT(prop_object_is_dictionary(pd)); if (pd->pd_version != pdi->pdi_base.pi_version) goto out; /* dictionary changed during iteration */ _PROP_ASSERT(pdi->pdi_index <= pd->pd_count); if (pdi->pdi_index == pd->pd_count) goto out; /* we've iterated all objects */ pdk = pd->pd_array[pdi->pdi_index].pde_key; pdi->pdi_index++; out: return (pdk); } static prop_object_t _prop_dictionary_iterator_next_object(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd _PROP_ARG_UNUSED = pdi->pdi_base.pi_obj; prop_dictionary_keysym_t pdk; _PROP_ASSERT(prop_object_is_dictionary(pd)); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); pdk = _prop_dictionary_iterator_next_object_locked(pdi); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (pdk); } static void _prop_dictionary_iterator_reset_locked(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd = pdi->pdi_base.pi_obj; _PROP_ASSERT(prop_object_is_dictionary(pd)); pdi->pdi_index = 0; pdi->pdi_base.pi_version = pd->pd_version; } static void _prop_dictionary_iterator_reset(void *v) { struct _prop_dictionary_iterator *pdi = v; prop_dictionary_t pd _PROP_ARG_UNUSED = pdi->pdi_base.pi_obj; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); _prop_dictionary_iterator_reset_locked(pdi); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_create -- * Create a dictionary. */ prop_dictionary_t prop_dictionary_create(void) { return (_prop_dictionary_alloc(0)); } /* * prop_dictionary_create_with_capacity -- * Create a dictionary with the capacity to store N objects. */ prop_dictionary_t prop_dictionary_create_with_capacity(unsigned int capacity) { return (_prop_dictionary_alloc(capacity)); } /* * prop_dictionary_copy -- * Copy a dictionary. The new dictionary has an initial capacity equal * to the number of objects stored int the original dictionary. The new * dictionary contains references to the original dictionary's objects, * not copies of those objects (i.e. a shallow copy). */ prop_dictionary_t prop_dictionary_copy(prop_dictionary_t opd) { prop_dictionary_t pd; prop_dictionary_keysym_t pdk; prop_object_t po; unsigned int idx; if (! 
prop_object_is_dictionary(opd)) return (NULL); _PROP_RWLOCK_RDLOCK(opd->pd_rwlock); pd = _prop_dictionary_alloc(opd->pd_count); if (pd != NULL) { for (idx = 0; idx < opd->pd_count; idx++) { pdk = opd->pd_array[idx].pde_key; po = opd->pd_array[idx].pde_objref; prop_object_retain(pdk); prop_object_retain(po); pd->pd_array[idx].pde_key = pdk; pd->pd_array[idx].pde_objref = po; } pd->pd_count = opd->pd_count; pd->pd_flags = opd->pd_flags; } _PROP_RWLOCK_UNLOCK(opd->pd_rwlock); return (pd); } /* * prop_dictionary_copy_mutable -- * Like prop_dictionary_copy(), but the resulting dictionary is * mutable. */ prop_dictionary_t prop_dictionary_copy_mutable(prop_dictionary_t opd) { prop_dictionary_t pd; if (! prop_object_is_dictionary(opd)) return (NULL); pd = prop_dictionary_copy(opd); if (pd != NULL) pd->pd_flags &= ~PD_F_IMMUTABLE; return (pd); } /* * prop_dictionary_make_immutable -- * Set the immutable flag on that dictionary. */ void prop_dictionary_make_immutable(prop_dictionary_t pd) { _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); if (prop_dictionary_is_immutable(pd) == false) pd->pd_flags |= PD_F_IMMUTABLE; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_count -- * Return the number of objects stored in the dictionary. */ unsigned int prop_dictionary_count(prop_dictionary_t pd) { unsigned int rv; if (! prop_object_is_dictionary(pd)) return (0); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); rv = pd->pd_count; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* * prop_dictionary_ensure_capacity -- * Ensure that the dictionary has the capacity to store the specified * total number of objects (including the objects already stored in * the dictionary). */ bool prop_dictionary_ensure_capacity(prop_dictionary_t pd, unsigned int capacity) { bool rv; if (! prop_object_is_dictionary(pd)) return (false); _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); if (capacity > pd->pd_capacity) rv = _prop_dictionary_expand(pd, capacity); else rv = true; _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } static prop_object_iterator_t _prop_dictionary_iterator_locked(prop_dictionary_t pd) { struct _prop_dictionary_iterator *pdi; if (! prop_object_is_dictionary(pd)) return (NULL); pdi = _PROP_CALLOC(sizeof(*pdi), M_TEMP); if (pdi == NULL) return (NULL); pdi->pdi_base.pi_next_object = _prop_dictionary_iterator_next_object; pdi->pdi_base.pi_reset = _prop_dictionary_iterator_reset; prop_object_retain(pd); pdi->pdi_base.pi_obj = pd; _prop_dictionary_iterator_reset_locked(pdi); return (&pdi->pdi_base); } /* * prop_dictionary_iterator -- * Return an iterator for the dictionary. The dictionary is retained by * the iterator. */ prop_object_iterator_t prop_dictionary_iterator(prop_dictionary_t pd) { prop_object_iterator_t pi; _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); pi = _prop_dictionary_iterator_locked(pd); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (pi); } /* * prop_dictionary_all_keys -- * Return an array containing a snapshot of all of the keys * in the dictionary. */ prop_array_t prop_dictionary_all_keys(prop_dictionary_t pd) { prop_array_t array; unsigned int idx; bool rv = true; if (! prop_object_is_dictionary(pd)) return (NULL); /* There is no pressing need to lock the dictionary for this. 
*/ array = prop_array_create_with_capacity(pd->pd_count); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); for (idx = 0; idx < pd->pd_count; idx++) { rv = prop_array_add(array, pd->pd_array[idx].pde_key); if (rv == false) break; } _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); if (rv == false) { prop_object_release(array); array = NULL; } return (array); } static struct _prop_dict_entry * _prop_dict_lookup(prop_dictionary_t pd, const char *key, unsigned int *idxp) { struct _prop_dict_entry *pde; unsigned int base, idx, distance; int res; /* * Dictionary must be READ-LOCKED or WRITE-LOCKED. */ for (idx = 0, base = 0, distance = pd->pd_count; distance != 0; distance >>= 1) { idx = base + (distance >> 1); pde = &pd->pd_array[idx]; _PROP_ASSERT(pde->pde_key != NULL); res = strcmp(key, pde->pde_key->pdk_key); if (res == 0) { if (idxp != NULL) *idxp = idx; return (pde); } if (res > 0) { /* key > pdk_key: move right */ base = idx + 1; distance--; } /* else move left */ } /* idx points to the slot we looked at last. */ if (idxp != NULL) *idxp = idx; return (NULL); } static prop_object_t _prop_dictionary_get(prop_dictionary_t pd, const char *key, bool locked) { const struct _prop_dict_entry *pde; prop_object_t po = NULL; if (! prop_object_is_dictionary(pd)) return (NULL); if (!locked) { _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); } pde = _prop_dict_lookup(pd, key, NULL); if (pde != NULL) { _PROP_ASSERT(pde->pde_objref != NULL); po = pde->pde_objref; } if (!locked) { _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } return (po); } /* * prop_dictionary_get -- * Return the object stored with specified key. */ prop_object_t prop_dictionary_get(prop_dictionary_t pd, const char *key) { prop_object_t po = NULL; if (! prop_object_is_dictionary(pd)) return (NULL); _PROP_RWLOCK_RDLOCK(pd->pd_rwlock); po = _prop_dictionary_get(pd, key, true); _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (po); } static prop_object_t _prop_dictionary_get_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk, bool locked) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return (NULL); return (_prop_dictionary_get(pd, pdk->pdk_key, locked)); } /* * prop_dictionary_get_keysym -- * Return the object stored at the location encoded by the keysym. */ prop_object_t prop_dictionary_get_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk) { return (_prop_dictionary_get_keysym(pd, pdk, false)); } /* * prop_dictionary_set -- * Store a reference to an object at with the specified key. * If the key already exist, the original object is released. */ bool prop_dictionary_set(prop_dictionary_t pd, const char *key, prop_object_t po) { struct _prop_dict_entry *pde; prop_dictionary_keysym_t pdk; unsigned int idx; bool rv = false; if (! prop_object_is_dictionary(pd)) return (false); _PROP_ASSERT(pd->pd_count <= pd->pd_capacity); if (prop_dictionary_is_immutable(pd)) return (false); _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); pde = _prop_dict_lookup(pd, key, &idx); if (pde != NULL) { prop_object_t opo = pde->pde_objref; prop_object_retain(po); pde->pde_objref = po; prop_object_release(opo); rv = true; goto out; } pdk = _prop_dict_keysym_alloc(key); if (pdk == NULL) goto out; if (pd->pd_count == pd->pd_capacity && _prop_dictionary_expand(pd, pd->pd_capacity + EXPAND_STEP) == false) { prop_object_release(pdk); goto out; } /* At this point, the store will succeed. 
*/ prop_object_retain(po); if (pd->pd_count == 0) { pd->pd_array[0].pde_key = pdk; pd->pd_array[0].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; goto out; } pde = &pd->pd_array[idx]; _PROP_ASSERT(pde->pde_key != NULL); if (strcmp(key, pde->pde_key->pdk_key) < 0) { /* * key < pdk_key: insert to the left. This is the same as * inserting to the right, except we decrement the current * index first. * * Because we're unsigned, we have to special case 0 * (grumble). */ if (idx == 0) { memmove(&pd->pd_array[1], &pd->pd_array[0], pd->pd_count * sizeof(*pde)); pd->pd_array[0].pde_key = pdk; pd->pd_array[0].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; goto out; } idx--; } memmove(&pd->pd_array[idx + 2], &pd->pd_array[idx + 1], (pd->pd_count - (idx + 1)) * sizeof(*pde)); pd->pd_array[idx + 1].pde_key = pdk; pd->pd_array[idx + 1].pde_objref = po; pd->pd_count++; pd->pd_version++; rv = true; out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); return (rv); } /* * prop_dictionary_set_keysym -- * Replace the object in the dictionary at the location encoded by * the keysym. */ bool prop_dictionary_set_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk, prop_object_t po) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return (false); return (prop_dictionary_set(pd, pdk->pdk_key, po)); } static void _prop_dictionary_remove(prop_dictionary_t pd, struct _prop_dict_entry *pde, unsigned int idx) { prop_dictionary_keysym_t pdk = pde->pde_key; prop_object_t po = pde->pde_objref; /* * Dictionary must be WRITE-LOCKED. */ _PROP_ASSERT(pd->pd_count != 0); _PROP_ASSERT(idx < pd->pd_count); _PROP_ASSERT(pde == &pd->pd_array[idx]); idx++; memmove(&pd->pd_array[idx - 1], &pd->pd_array[idx], (pd->pd_count - idx) * sizeof(*pde)); pd->pd_count--; pd->pd_version++; prop_object_release(pdk); prop_object_release(po); } /* * prop_dictionary_remove -- * Remove the reference to an object with the specified key from * the dictionary. */ void prop_dictionary_remove(prop_dictionary_t pd, const char *key) { struct _prop_dict_entry *pde; unsigned int idx; if (! prop_object_is_dictionary(pd)) return; _PROP_RWLOCK_WRLOCK(pd->pd_rwlock); /* XXX Should this be a _PROP_ASSERT()? */ if (prop_dictionary_is_immutable(pd)) goto out; pde = _prop_dict_lookup(pd, key, &idx); /* XXX Should this be a _PROP_ASSERT()? */ if (pde == NULL) goto out; _prop_dictionary_remove(pd, pde, idx); out: _PROP_RWLOCK_UNLOCK(pd->pd_rwlock); } /* * prop_dictionary_remove_keysym -- * Remove a reference to an object stored in the dictionary at the * location encoded by the keysym. */ void prop_dictionary_remove_keysym(prop_dictionary_t pd, prop_dictionary_keysym_t pdk) { if (! (prop_object_is_dictionary(pd) && prop_object_is_dictionary_keysym(pdk))) return; prop_dictionary_remove(pd, pdk->pdk_key); } /* * prop_dictionary_equals -- * Return true if the two dictionaries are equivalent. Note we do a * by-value comparison of the objects in the dictionary. */ bool prop_dictionary_equals(prop_dictionary_t dict1, prop_dictionary_t dict2) { if (!prop_object_is_dictionary(dict1) || !prop_object_is_dictionary(dict2)) return (false); return (prop_object_equals(dict1, dict2)); } /* * prop_dictionary_keysym_value -- * Return a reference to the keysym's value. */ const char * prop_dictionary_keysym_value(prop_dictionary_keysym_t pdk) { if (! 
prop_object_is_dictionary_keysym(pdk)) return (NULL); return (pdk->pdk_key); } _PROP_DEPRECATED(prop_dictionary_keysym_cstring_nocopy, "this program uses prop_dictionary_keysym_cstring_nocopy(), " "which is deprecated; use prop_dictionary_keysym_value() instead.") const char * prop_dictionary_keysym_cstring_nocopy(prop_dictionary_keysym_t pdk) { if (! prop_object_is_dictionary_keysym(pdk)) return (NULL); return (pdk->pdk_key); } /* * prop_dictionary_keysym_equals -- * Return true if the two dictionary key symbols are equivalent. * Note: We do not compare the object references. */ bool prop_dictionary_keysym_equals(prop_dictionary_keysym_t pdk1, prop_dictionary_keysym_t pdk2) { if (!prop_object_is_dictionary_keysym(pdk1) || !prop_object_is_dictionary_keysym(pdk2)) return (false); return (prop_object_equals(pdk1, pdk2)); } /* * prop_dictionary_externalize -- * Externalize a dictionary, returning a NUL-terminated buffer * containing the XML-style representation. The buffer is allocated * with the M_TEMP memory type. */ char * prop_dictionary_externalize(prop_dictionary_t pd) { struct _prop_object_externalize_context *ctx; char *cp; ctx = _prop_object_externalize_context_alloc(); if (ctx == NULL) return (NULL); if (_prop_object_externalize_header(ctx) == false || (*pd->pd_obj.po_type->pot_extern)(ctx, pd) == false || _prop_object_externalize_footer(ctx) == false) { /* We are responsible for releasing the buffer. */ _PROP_FREE(ctx->poec_buf, M_TEMP); _prop_object_externalize_context_free(ctx); return (NULL); } cp = ctx->poec_buf; _prop_object_externalize_context_free(ctx); return (cp); } /* * _prop_dictionary_internalize -- * Parse a <dict>...</dict> and return the object created from the * external representation. * * Internal state in via rec_data is the storage area for the last processed * key. * _prop_dictionary_internalize_body is the upper half of the parse loop. * It is responsible for parsing the key directly and storing it in the area * referenced by rec_data. * _prop_dictionary_internalize_cont is the lower half and called with the value * associated with the key. */ static bool _prop_dictionary_internalize_body(prop_stack_t, prop_object_t *, struct _prop_object_internalize_context *, char *); bool _prop_dictionary_internalize(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx) { prop_dictionary_t dict; char *tmpkey; /* We don't currently understand any attributes. */ if (ctx->poic_tagattr != NULL) return (true); dict = prop_dictionary_create(); if (dict == NULL) return (true); if (ctx->poic_is_empty_element) { *obj = dict; return (true); } tmpkey = _PROP_MALLOC(PDK_MAXKEY + 1, M_TEMP); if (tmpkey == NULL) { prop_object_release(dict); return (true); } *obj = dict; /* * Opening tag is found, storage for key allocated and * now continue to the first element. */ return _prop_dictionary_internalize_body(stack, obj, ctx, tmpkey); } static bool _prop_dictionary_internalize_continue(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx, void *data, prop_object_t child) { prop_dictionary_t dict = *obj; char *tmpkey = data; _PROP_ASSERT(tmpkey != NULL); if (child == NULL || prop_dictionary_set(dict, tmpkey, child) == false) { _PROP_FREE(tmpkey, M_TEMP); if (child != NULL) prop_object_release(child); prop_object_release(dict); *obj = NULL; return (true); } prop_object_release(child); /* * key, value was added, now continue looking for the next key * or the closing tag. 
*/ return _prop_dictionary_internalize_body(stack, obj, ctx, tmpkey); } static bool _prop_dictionary_internalize_body(prop_stack_t stack, prop_object_t *obj, struct _prop_object_internalize_context *ctx, char *tmpkey) { prop_dictionary_t dict = *obj; size_t keylen; /* Fetch the next tag. */ if (_prop_object_internalize_find_tag(ctx, NULL, _PROP_TAG_TYPE_EITHER) == false) goto bad; /* Check to see if this is the end of the dictionary. */ if (_PROP_TAG_MATCH(ctx, "dict") && ctx->poic_tag_type == _PROP_TAG_TYPE_END) { _PROP_FREE(tmpkey, M_TEMP); return (true); } /* Ok, it must be a non-empty key start tag. */ if (!_PROP_TAG_MATCH(ctx, "key") || ctx->poic_tag_type != _PROP_TAG_TYPE_START || ctx->poic_is_empty_element) goto bad; if (_prop_object_internalize_decode_string(ctx, tmpkey, PDK_MAXKEY, &keylen, &ctx->poic_cp) == false) goto bad; _PROP_ASSERT(keylen <= PDK_MAXKEY); tmpkey[keylen] = '\0'; if (_prop_object_internalize_find_tag(ctx, "key", _PROP_TAG_TYPE_END) == false) goto bad; /* ..and now the beginning of the value. */ if (_prop_object_internalize_find_tag(ctx, NULL, _PROP_TAG_TYPE_START) == false) goto bad; /* * Key is found, now wait for value to be parsed. */ if (_prop_stack_push(stack, *obj, _prop_dictionary_internalize_continue, tmpkey, NULL)) return (false); bad: _PROP_FREE(tmpkey, M_TEMP); prop_object_release(dict); *obj = NULL; return (true); } /* * prop_dictionary_internalize -- * Create a dictionary by parsing the NUL-terminated XML-style * representation. */ prop_dictionary_t prop_dictionary_internalize(const char *xml) { return _prop_generic_internalize(xml, "dict"); } #if !defined(_KERNEL) && !defined(_STANDALONE) /* * prop_dictionary_externalize_to_file -- * Externalize a dictionary to the specified file. */ bool prop_dictionary_externalize_to_file(prop_dictionary_t dict, const char *fname) { char *xml; bool rv; int save_errno = 0; /* XXXGCC -Wuninitialized [mips, ...] */ xml = prop_dictionary_externalize(dict); if (xml == NULL) return (false); rv = _prop_object_externalize_write_file(fname, xml, strlen(xml)); if (rv == false) save_errno = errno; _PROP_FREE(xml, M_TEMP); if (rv == false) errno = save_errno; return (rv); } /* * prop_dictionary_internalize_from_file -- * Internalize a dictionary from a file. */ prop_dictionary_t prop_dictionary_internalize_from_file(const char *fname) { struct _prop_object_internalize_mapped_file *mf; prop_dictionary_t dict; mf = _prop_object_internalize_map_file(fname); if (mf == NULL) return (NULL); dict = prop_dictionary_internalize(mf->poimf_xml); _prop_object_internalize_unmap_file(mf); return (dict); } #endif /* !_KERNEL && !_STANDALONE */
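The prop_dictionary code above keeps its entries in an array sorted by key: _prop_dict_lookup() is a binary search whose last-examined index doubles as the insertion hint, and prop_dictionary_set() shifts the tail of the array with memmove() to make room. A minimal stand-alone sketch of that sorted-array pattern follows; the names (struct entry, lookup, set) are illustrative only and are not part of the proplib API.

#include <stdio.h>
#include <string.h>

#define MAXENT	8

struct entry {
	const char *key;
	int	    val;
};

static struct entry tab[MAXENT];
static unsigned	    count;

/*
 * Binary search over the sorted array.  Returns the matching entry or
 * NULL; *idxp is always set to the slot examined last, which the caller
 * uses as the insertion hint (same idea as _prop_dict_lookup()).
 */
static struct entry *
lookup(const char *key, unsigned *idxp)
{
	unsigned base = 0, idx = 0, distance = count;
	int res;

	for (; distance != 0; distance >>= 1) {
		idx = base + (distance >> 1);
		res = strcmp(key, tab[idx].key);
		if (res == 0) {
			*idxp = idx;
			return &tab[idx];
		}
		if (res > 0) {		/* key > tab[idx].key: move right */
			base = idx + 1;
			distance--;
		}			/* else move left */
	}
	*idxp = idx;
	return NULL;
}

/* Insert or replace, keeping the array sorted (mirrors prop_dictionary_set). */
static int
set(const char *key, int val)
{
	struct entry *e;
	unsigned idx;

	if ((e = lookup(key, &idx)) != NULL) {
		e->val = val;			/* key exists: replace in place */
		return 1;
	}
	if (count == MAXENT)
		return 0;
	if (count == 0) {
		tab[0].key = key; tab[0].val = val; count++;
		return 1;
	}
	if (strcmp(key, tab[idx].key) < 0) {	/* insert to the left of idx */
		if (idx == 0) {
			memmove(&tab[1], &tab[0], count * sizeof(tab[0]));
			tab[0].key = key; tab[0].val = val; count++;
			return 1;
		}
		idx--;
	}
	/* Insert to the right of idx: shift the tail up by one slot. */
	memmove(&tab[idx + 2], &tab[idx + 1],
	    (count - (idx + 1)) * sizeof(tab[0]));
	tab[idx + 1].key = key; tab[idx + 1].val = val; count++;
	return 1;
}

int
main(void)
{
	unsigned i;

	set("uid", 0); set("gid", 100); set("mode", 644); set("gid", 101);
	for (i = 0; i < count; i++)
		printf("%s = %d\n", tab[i].key, tab[i].val);
	return 0;
}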
/* $NetBSD: uvm_fault_i.h,v 1.33 2020/02/23 15:46:43 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp */ #ifndef _UVM_UVM_FAULT_I_H_ #define _UVM_UVM_FAULT_I_H_ /* * uvm_fault_i.h: fault inline functions */ void uvmfault_update_stats(struct uvm_faultinfo *); /* * uvmfault_unlockmaps: unlock the maps */ static __inline void uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked) { /* * ufi can be NULL when this isn't really a fault, * but merely paging in anon data. */ if (ufi == NULL) { return; } #ifndef __HAVE_NO_PMAP_STATS uvmfault_update_stats(ufi); #endif if (write_locked) { vm_map_unlock(ufi->map); } else { vm_map_unlock_read(ufi->map); } } /* * uvmfault_unlockall: unlock everything passed in. * * => maps must be read-locked (not write-locked). */ static __inline void uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap, struct uvm_object *uobj) { if (uobj) rw_exit(uobj->vmobjlock); if (amap) amap_unlock(amap); uvmfault_unlockmaps(ufi, false); } /* * uvmfault_lookup: lookup a virtual address in a map * * => caller must provide a uvm_faultinfo structure with the IN * params properly filled in * => we will lookup the map entry (handling submaps) as we go * => if the lookup is a success we will return with the maps locked * => if "write_lock" is true, we write_lock the map, otherwise we only * get a read lock. 
* => note that submaps can only appear in the kernel and they are * required to use the same virtual addresses as the map they * are referenced by (thus address translation between the main * map and the submap is unnecessary). */ static __inline bool uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock) { struct vm_map *tmpmap; /* * init ufi values for lookup. */ ufi->map = ufi->orig_map; ufi->size = ufi->orig_size; /* * keep going down levels until we are done. note that there can * only be two levels so we won't loop very long. */ for (;;) { /* * lock map */ if (write_lock) { vm_map_lock(ufi->map); } else { vm_map_lock_read(ufi->map); } /* * lookup */ if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr, &ufi->entry)) { uvmfault_unlockmaps(ufi, write_lock); return(false); } /* * reduce size if necessary */ if (ufi->entry->end - ufi->orig_rvaddr < ufi->size) ufi->size = ufi->entry->end - ufi->orig_rvaddr; /* * submap? replace map with the submap and lookup again. * note: VAs in submaps must match VAs in main map. */ if (UVM_ET_ISSUBMAP(ufi->entry)) { tmpmap = ufi->entry->object.sub_map; if (write_lock) { vm_map_unlock(ufi->map); } else { vm_map_unlock_read(ufi->map); } ufi->map = tmpmap; continue; } /* * got it! */ ufi->mapv = ufi->map->timestamp; return(true); } /* while loop */ /*NOTREACHED*/ } /* * uvmfault_relock: attempt to relock the same version of the map * * => fault data structures should be unlocked before calling. * => if a success (true) maps will be locked after call. */ static __inline bool uvmfault_relock(struct uvm_faultinfo *ufi) { /* * ufi can be NULL when this isn't really a fault, * but merely paging in anon data. */ if (ufi == NULL) { return true; } cpu_count(CPU_COUNT_FLTRELCK, 1); /* * relock map. fail if version mismatch (in which case nothing * gets locked). */ vm_map_lock_read(ufi->map); if (ufi->mapv != ufi->map->timestamp) { vm_map_unlock_read(ufi->map); return(false); } cpu_count(CPU_COUNT_FLTRELCKOK, 1); return(true); } #endif /* _UVM_UVM_FAULT_I_H_ */
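uvmfault_lookup() records the map's timestamp in ufi->mapv so that uvmfault_relock() can later re-take the read lock and simply compare versions instead of redoing the whole lookup. The user-space sketch below illustrates that optimistic relock pattern with a pthread rwlock; the struct and function names are hypothetical and are not UVM interfaces.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct map {
	pthread_rwlock_t lock;
	unsigned	 timestamp;	/* bumped on every write-locked change */
};

struct faultinfo {
	struct map *map;
	unsigned    mapv;		/* version observed while locked */
};

/* Look something up: take the read lock and remember the version. */
static void
fault_lookup(struct faultinfo *fi, struct map *m)
{
	fi->map = m;
	pthread_rwlock_rdlock(&m->lock);
	fi->mapv = m->timestamp;
}

static void
fault_unlock(struct faultinfo *fi)
{
	pthread_rwlock_unlock(&fi->map->lock);
}

/* Writers bump the version so unlocked readers notice the change. */
static void
map_modify(struct map *m)
{
	pthread_rwlock_wrlock(&m->lock);
	m->timestamp++;
	pthread_rwlock_unlock(&m->lock);
}

/*
 * Try to get the read lock back.  Fail (leaving the map unlocked) if the
 * version changed while we were away -- the caller must redo the lookup,
 * just as callers of uvmfault_relock() do.
 */
static bool
fault_relock(struct faultinfo *fi)
{
	pthread_rwlock_rdlock(&fi->map->lock);
	if (fi->mapv != fi->map->timestamp) {
		pthread_rwlock_unlock(&fi->map->lock);
		return false;
	}
	return true;
}

int
main(void)
{
	static struct map m = { PTHREAD_RWLOCK_INITIALIZER, 0 };
	struct faultinfo fi;

	fault_lookup(&fi, &m);
	fault_unlock(&fi);		/* e.g. to wait for I/O */
	map_modify(&m);			/* someone changed the map meanwhile */
	printf("relock %s\n", fault_relock(&fi) ? "ok" : "failed, retry lookup");
	return 0;
}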
/* $NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $ */ /*- * Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $"); #include <sys/types.h> #include <sys/mount.h> #include <sys/fstrans.h> #include <sys/statvfs.h> #include <sys/vnode.h> #include <miscfs/genfs/genfs.h> #include <miscfs/genfs/genfs_node.h> int genfs_statvfs(struct mount *mp, struct statvfs *sbp) { sbp->f_bsize = DEV_BSIZE; sbp->f_frsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1k to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_bresvd = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_favail = 0; sbp->f_fresvd = 0; copy_statvfs_info(sbp, mp); return 0; } int genfs_renamelock_enter(struct mount *mp) { mutex_enter(mp->mnt_renamelock); /* Preserve possible error return in case we become interruptible. */ return 0; } void genfs_renamelock_exit(struct mount *mp) { mutex_exit(mp->mnt_renamelock); } int genfs_suspendctl(struct mount *mp, int cmd) { int error; switch (cmd) { case SUSPEND_SUSPEND: error = fstrans_setstate(mp, FSTRANS_SUSPENDING); if (error) return error; error = fstrans_setstate(mp, FSTRANS_SUSPENDED); return error; case SUSPEND_RESUME: error = fstrans_setstate(mp, FSTRANS_NORMAL); KASSERT(error == 0); return 0; default: panic("%s: bogus command %d", __func__, cmd); } }
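genfs_suspendctl() above performs suspend in two fstrans steps (SUSPENDING, then SUSPENDED) and resume in a single step, asserting that the resume transition cannot fail. The sketch below models that state machine in user space; the transition checks in setstate() are an assumption made purely for illustration and are not the actual fstrans_setstate() implementation.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

enum fstate { ST_NORMAL, ST_SUSPENDING, ST_SUSPENDED };

static const char *names[] = { "NORMAL", "SUSPENDING", "SUSPENDED" };

struct mountpt {
	enum fstate state;
};

/* Allow only the transitions the suspend/resume sequence relies on (assumed rules). */
static int
setstate(struct mountpt *mp, enum fstate new)
{
	switch (new) {
	case ST_SUSPENDING:
		if (mp->state != ST_NORMAL)
			return EBUSY;
		break;
	case ST_SUSPENDED:
		if (mp->state != ST_SUSPENDING)
			return EBUSY;
		break;
	case ST_NORMAL:
		break;			/* resume is always possible */
	}
	mp->state = new;
	printf("-> %s\n", names[new]);
	return 0;
}

static int
suspendctl(struct mountpt *mp, int suspend)
{
	int error;

	if (suspend) {
		/* Two phases: stop admitting new transactions, then drain. */
		if ((error = setstate(mp, ST_SUSPENDING)) != 0)
			return error;
		return setstate(mp, ST_SUSPENDED);
	}
	error = setstate(mp, ST_NORMAL);
	assert(error == 0);		/* resume must not fail */
	return 0;
}

int
main(void)
{
	struct mountpt mp = { ST_NORMAL };

	suspendctl(&mp, 1);
	suspendctl(&mp, 0);
	return 0;
}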
/* $NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $ */ /*- * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2006 YAMAMOTO Takashi. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $"); #include <sys/param.h> #include <sys/kmem.h> #include <sys/specificdata.h> #include <sys/queue.h> #include <sys/mutex.h> /* * Locking notes: * * The specdataref_container pointer in the specificdata_reference * is volatile. To read it, you must hold EITHER the domain lock * or the ref lock. To write it, you must hold BOTH the domain lock * and the ref lock. The locks must be acquired in the following * order: * domain -> ref */ typedef struct { specificdata_dtor_t ski_dtor; } specificdata_key_impl; struct specificdata_container { size_t sc_nkey; LIST_ENTRY(specificdata_container) sc_list; void * sc_data[]; /* variable length */ }; #define SPECIFICDATA_CONTAINER_BYTESIZE(n) \ (sizeof(struct specificdata_container) + ((n) * sizeof(void *))) struct specificdata_domain { kmutex_t sd_lock; unsigned int sd_nkey; LIST_HEAD(, specificdata_container) sd_list; specificdata_key_impl *sd_keys; }; static void specificdata_container_link(specificdata_domain_t sd, specificdata_container_t sc) { LIST_INSERT_HEAD(&sd->sd_list, sc, sc_list); } static void specificdata_container_unlink(specificdata_domain_t sd, specificdata_container_t sc) { LIST_REMOVE(sc, sc_list); } static void specificdata_destroy_datum(specificdata_domain_t sd, specificdata_container_t sc, specificdata_key_t key) { specificdata_dtor_t dtor; void *data; if (key >= sc->sc_nkey) return; KASSERT(key < sd->sd_nkey); data = sc->sc_data[key]; dtor = sd->sd_keys[key].ski_dtor; if (dtor != NULL) { if (data != NULL) { sc->sc_data[key] = NULL; (*dtor)(data); } } else { KASSERT(data == NULL); } } static void specificdata_noop_dtor(void *data) { /* nothing */ } /* * specificdata_domain_create -- * Create a specificdata domain. */ specificdata_domain_t specificdata_domain_create(void) { specificdata_domain_t sd; sd = kmem_zalloc(sizeof(*sd), KM_SLEEP); mutex_init(&sd->sd_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&sd->sd_list); return (sd); } /* * specificdata_domain_delete -- * Destroy a specificdata domain. */ void specificdata_domain_delete(specificdata_domain_t sd) { panic("specificdata_domain_delete: not implemented"); } /* * specificdata_key_create -- * Create a specificdata key for a domain. * * Note: This is a rare operation. 
*/ int specificdata_key_create(specificdata_domain_t sd, specificdata_key_t *keyp, specificdata_dtor_t dtor) { specificdata_key_impl *newkeys; specificdata_key_t key = 0; size_t nsz; ASSERT_SLEEPABLE(); if (dtor == NULL) dtor = specificdata_noop_dtor; mutex_enter(&sd->sd_lock); if (sd->sd_keys == NULL) goto needalloc; for (; key < sd->sd_nkey; key++) { if (sd->sd_keys[key].ski_dtor == NULL) goto gotit; } needalloc: nsz = (sd->sd_nkey + 1) * sizeof(*newkeys); /* XXXSMP allocating memory while holding a lock. */ newkeys = kmem_zalloc(nsz, KM_SLEEP); if (sd->sd_keys != NULL) { size_t osz = sd->sd_nkey * sizeof(*newkeys); memcpy(newkeys, sd->sd_keys, osz); kmem_free(sd->sd_keys, osz); } sd->sd_keys = newkeys; sd->sd_nkey++; gotit: sd->sd_keys[key].ski_dtor = dtor; mutex_exit(&sd->sd_lock); *keyp = key; return (0); } /* * specificdata_key_delete -- * Destroy a specificdata key for a domain. * * Note: This is a rare operation. */ void specificdata_key_delete(specificdata_domain_t sd, specificdata_key_t key) { specificdata_container_t sc; mutex_enter(&sd->sd_lock); if (key >= sd->sd_nkey) goto out; /* * Traverse all of the specificdata containers in the domain * and the destroy the datum for the dying key. */ LIST_FOREACH(sc, &sd->sd_list, sc_list) { specificdata_destroy_datum(sd, sc, key); } sd->sd_keys[key].ski_dtor = NULL; out: mutex_exit(&sd->sd_lock); } /* * specificdata_init -- * Initialize a specificdata container for operation in the * specified domain. */ int specificdata_init(specificdata_domain_t sd, specificdata_reference *ref) { /* * Just NULL-out the container pointer; we'll allocate the * container the first time specificdata is put into it. */ ref->specdataref_container = NULL; mutex_init(&ref->specdataref_lock, MUTEX_DEFAULT, IPL_NONE); return (0); } /* * specificdata_fini -- * Destroy a specificdata container. We destroy all of the datums * stuffed into the container just as if the key were destroyed. */ void specificdata_fini(specificdata_domain_t sd, specificdata_reference *ref) { specificdata_container_t sc; specificdata_key_t key; ASSERT_SLEEPABLE(); mutex_destroy(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc == NULL) return; ref->specdataref_container = NULL; mutex_enter(&sd->sd_lock); specificdata_container_unlink(sd, sc); for (key = 0; key < sc->sc_nkey; key++) { specificdata_destroy_datum(sd, sc, key); } mutex_exit(&sd->sd_lock); kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey)); } /* * specificdata_getspecific -- * Get a datum from a container. */ void * specificdata_getspecific(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key) { specificdata_container_t sc; void *data = NULL; mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc != NULL && key < sc->sc_nkey) data = sc->sc_data[key]; mutex_exit(&ref->specdataref_lock); return (data); } /* * specificdata_getspecific_unlocked -- * Get a datum from a container in a lockless fashion. * * Note: When using this routine, care must be taken to ensure * that no other thread could cause the specificdata_reference * to become invalid (i.e. point at the wrong container) by * issuing a setspecific call or destroying the container. 
*/ void * specificdata_getspecific_unlocked(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key) { specificdata_container_t sc; sc = ref->specdataref_container; if (sc != NULL && key < sc->sc_nkey) return (sc->sc_data[key]); return (NULL); } /* * specificdata_setspecific -- * Put a datum into a container. */ void specificdata_setspecific(specificdata_domain_t sd, specificdata_reference *ref, specificdata_key_t key, void *data) { specificdata_container_t sc, newsc; size_t newnkey, sz; ASSERT_SLEEPABLE(); mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (__predict_true(sc != NULL && key < sc->sc_nkey)) { sc->sc_data[key] = data; mutex_exit(&ref->specdataref_lock); return; } mutex_exit(&ref->specdataref_lock); /* * Slow path: need to resize. */ mutex_enter(&sd->sd_lock); newnkey = sd->sd_nkey; if (key >= newnkey) { mutex_exit(&sd->sd_lock); panic("specificdata_setspecific"); } sz = SPECIFICDATA_CONTAINER_BYTESIZE(newnkey); newsc = kmem_zalloc(sz, KM_SLEEP); newsc->sc_nkey = newnkey; mutex_enter(&ref->specdataref_lock); sc = ref->specdataref_container; if (sc != NULL) { if (key < sc->sc_nkey) { /* * Someone beat us to the punch. Unwind and put * the object into the now large enough container. */ sc->sc_data[key] = data; mutex_exit(&ref->specdataref_lock); mutex_exit(&sd->sd_lock); kmem_free(newsc, sz); return; } specificdata_container_unlink(sd, sc); memcpy(newsc->sc_data, sc->sc_data, sc->sc_nkey * sizeof(void *)); } newsc->sc_data[key] = data; specificdata_container_link(sd, newsc); ref->specdataref_container = newsc; mutex_exit(&ref->specdataref_lock); mutex_exit(&sd->sd_lock); if (sc != NULL) kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey)); }
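specificdata_setspecific() above grows a per-object container to the domain's current key count the first time a key does not fit, copying the old slots into the new allocation. Below is a single-threaded user-space sketch of the same grow-on-demand layout, with the domain and reference locking omitted; all names are illustrative, not the kernel interfaces.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The "domain" only needs to remember how many keys have been created. */
struct domain {
	unsigned nkey;
};

/* Each object carries a container sized to the keys seen so far. */
struct container {
	unsigned nkey;
	void	*data[];	/* flexible array, one slot per key */
};

static unsigned
key_create(struct domain *d)
{
	return d->nkey++;
}

static void *
getspecific(struct container *c, unsigned key)
{
	if (c == NULL || key >= c->nkey)
		return NULL;
	return c->data[key];
}

/*
 * Grow the container to the domain's current key count when the key does
 * not fit, copying the old slots across.  The kernel version does the
 * same thing, but under the domain and reference locks.
 */
static struct container *
setspecific(struct domain *d, struct container *c, unsigned key, void *data)
{
	struct container *nc;

	if (c != NULL && key < c->nkey) {
		c->data[key] = data;		/* fast path: slot exists */
		return c;
	}
	if (key >= d->nkey)
		abort();			/* key was never created */
	nc = calloc(1, sizeof(*nc) + d->nkey * sizeof(void *));
	if (nc == NULL)
		abort();
	nc->nkey = d->nkey;
	if (c != NULL) {
		memcpy(nc->data, c->data, c->nkey * sizeof(void *));
		free(c);
	}
	nc->data[key] = data;
	return nc;
}

int
main(void)
{
	struct domain d = { 0 };
	struct container *c = NULL;
	unsigned k0 = key_create(&d), k1 = key_create(&d);

	c = setspecific(&d, c, k0, "hello");
	c = setspecific(&d, c, k1, "world");
	printf("%s %s\n", (char *)getspecific(c, k0), (char *)getspecific(c, k1));
	free(c);
	return 0;
}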
/* $NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $ */ /* * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 */ /* * Copyright (c) 1994 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $"); #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/dirent.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/queue.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/kauth.h> #include <uvm/uvm_extern.h> #include <fs/union/union.h> #include <miscfs/genfs/genfs.h> #include <miscfs/specfs/specdev.h> static LIST_HEAD(uhashhead, union_node) *uhashtbl; static u_long uhash_mask; /* size of hash table - 1 */ #define UNION_HASH(u, l) \ ((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask) #define NOHASH ((u_long)-1) static kmutex_t uhash_lock; static void union_newupper(struct union_node *, struct vnode *); static void union_newlower(struct union_node *, struct vnode *); static void union_ref(struct union_node *); static void union_rele(struct union_node *); static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t, const char *); int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *); static void union_dircache_r(struct vnode *, struct vnode ***, int *); struct vnode *union_dircache(struct vnode *, struct lwp *); void union_init(void) { mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE); uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask); } void union_reinit(void) { struct union_node *un; struct uhashhead *oldhash, *hash; u_long oldmask, mask, val; int i; hash = hashinit(desiredvnodes, HASH_LIST, true, &mask); mutex_enter(&uhash_lock); oldhash = uhashtbl; oldmask = uhash_mask; uhashtbl = hash; uhash_mask = mask; for (i = 0; i <= oldmask; i++) { while ((un = LIST_FIRST(&oldhash[i])) != 
NULL) { LIST_REMOVE(un, un_cache); val = UNION_HASH(un->un_uppervp, un->un_lowervp); LIST_INSERT_HEAD(&hash[val], un, un_cache); } } mutex_exit(&uhash_lock); hashdone(oldhash, HASH_LIST, oldmask); } /* * Free global unionfs resources. */ void union_done(void) { hashdone(uhashtbl, HASH_LIST, uhash_mask); mutex_destroy(&uhash_lock); /* Make sure to unset the readdir hook. */ vn_union_readdir_hook = NULL; } void union_newlower(struct union_node *un, struct vnode *lowervp) { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(un->un_uppervp, lowervp); if (un->un_lowervp == lowervp) return; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); KASSERT(un->un_lowervp == NULL); mutex_enter(&uhash_lock); if (ohash != nhash && (un->un_cflags & UN_CACHED)) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_enter(&un->un_lock); un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; mutex_exit(&un->un_lock); if (ohash != nhash) { LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache); un->un_cflags |= UN_CACHED; } mutex_exit(&uhash_lock); } void union_newupper(struct union_node *un, struct vnode *uppervp) { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(uppervp, un->un_lowervp); struct vop_lock_args lock_ap; struct vop_unlock_args unlock_ap; int error __diagused; if (un->un_uppervp == uppervp) return; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); KASSERT(un->un_uppervp == NULL); /* * We have to transfer the vnode lock from the union vnode to * the upper vnode. Lock the upper vnode first. We cannot use * VOP_LOCK() here as it would break the fstrans state. */ lock_ap.a_desc = VDESC(vop_lock); lock_ap.a_vp = uppervp; lock_ap.a_flags = LK_EXCLUSIVE; error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap); KASSERT(error == 0); mutex_enter(&uhash_lock); if (ohash != nhash && (un->un_cflags & UN_CACHED)) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_enter(&un->un_lock); un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; /* * With the upper vnode in place unlock the union vnode to * finalize the lock transfer. */ unlock_ap.a_desc = VDESC(vop_unlock); unlock_ap.a_vp = UNIONTOV(un); genfs_unlock(&unlock_ap); /* Update union vnode interlock, vmobjlock, & klist. */ vshareilock(UNIONTOV(un), uppervp); rw_obj_hold(uppervp->v_uobj.vmobjlock); uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock); vshareklist(UNIONTOV(un), uppervp); mutex_exit(&un->un_lock); if (ohash != nhash) { LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache); un->un_cflags |= UN_CACHED; } mutex_exit(&uhash_lock); } /* * Keep track of size changes in the underlying vnodes. * If the size changes, then callback to the vm layer * giving priority to the upper layer size. * * Mutex un_lock hold on entry and released on return. */ void union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz) { struct union_node *un = VTOUNION(vp); off_t sz; KASSERT(mutex_owned(&un->un_lock)); /* only interested in regular files */ if (vp->v_type != VREG) { mutex_exit(&un->un_lock); uvm_vnp_setsize(vp, 0); return; } sz = VNOVAL; if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { un->un_uppersz = uppersz; if (sz == VNOVAL) sz = un->un_uppersz; } if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { un->un_lowersz = lowersz; if (sz == VNOVAL) sz = un->un_lowersz; } mutex_exit(&un->un_lock); if (sz != VNOVAL) { #ifdef UNION_DIAGNOSTIC printf("union: %s size now %qd\n", uppersz != VNOVAL ? 
"upper" : "lower", sz); #endif uvm_vnp_setsize(vp, sz); } } static void union_ref(struct union_node *un) { KASSERT(mutex_owned(&uhash_lock)); un->un_refs++; } static void union_rele(struct union_node *un) { mutex_enter(&uhash_lock); un->un_refs--; if (un->un_refs > 0) { mutex_exit(&uhash_lock); return; } if (un->un_cflags & UN_CACHED) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_exit(&uhash_lock); if (un->un_pvp != NULLVP) vrele(un->un_pvp); if (un->un_uppervp != NULLVP) vrele(un->un_uppervp); if (un->un_lowervp != NULLVP) vrele(un->un_lowervp); if (un->un_dirvp != NULLVP) vrele(un->un_dirvp); if (un->un_path) free(un->un_path, M_TEMP); mutex_destroy(&un->un_lock); free(un, M_TEMP); } /* * allocate a union_node/vnode pair. the vnode is * referenced and unlocked. the new vnode is returned * via (vpp). (mp) is the mountpoint of the union filesystem, * (dvp) is the parent directory where the upper layer object * should exist (but doesn't) and (cnp) is the componentname * information which is partially copied to allow the upper * layer object to be created at a later time. (uppervp) * and (lowervp) reference the upper and lower layer objects * being mapped. either, but not both, can be nil. * both, if supplied, are unlocked. * the reference is either maintained in the new union_node * object which is allocated, or they are vrele'd. * * all union_nodes are maintained on a hash * list. new nodes are only allocated when they cannot * be found on this list. entries on the list are * removed when the vfs reclaim entry is called. * * the vnode gets attached or referenced with vcache_get(). */ int union_allocvp( struct vnode **vpp, struct mount *mp, struct vnode *undvp, /* parent union vnode */ struct vnode *dvp, /* may be null */ struct componentname *cnp, /* may be null */ struct vnode *uppervp, /* may be null */ struct vnode *lowervp, /* may be null */ int docache) { int error; struct union_node *un = NULL, *un1; struct vnode *vp, *xlowervp = NULLVP; u_long hash[3]; int try; bool is_dotdot; is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT)); if (uppervp == NULLVP && lowervp == NULLVP) panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { xlowervp = lowervp; lowervp = NULLVP; } /* * If both uppervp and lowervp are not NULL we have to * search union nodes with one vnode as NULL too. */ hash[0] = UNION_HASH(uppervp, lowervp); if (uppervp == NULL || lowervp == NULL) { hash[1] = hash[2] = NOHASH; } else { hash[1] = UNION_HASH(uppervp, NULLVP); hash[2] = UNION_HASH(NULLVP, lowervp); } if (!docache) { un = NULL; goto found; } loop: mutex_enter(&uhash_lock); for (try = 0; try < 3; try++) { if (hash[try] == NOHASH) continue; LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) { if ((un->un_lowervp && un->un_lowervp != lowervp) || (un->un_uppervp && un->un_uppervp != uppervp) || un->un_mount != mp) continue; union_ref(un); mutex_exit(&uhash_lock); error = vcache_get(mp, &un, sizeof(un), &vp); KASSERT(error != 0 || UNIONTOV(un) == vp); union_rele(un); if (error == ENOENT) goto loop; else if (error) goto out; goto found; } } mutex_exit(&uhash_lock); found: if (un) { if (uppervp != dvp) { if (is_dotdot) VOP_UNLOCK(dvp); vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY); if (is_dotdot) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } /* * Save information about the upper layer. */ if (uppervp != un->un_uppervp) { union_newupper(un, uppervp); } else if (uppervp) { vrele(uppervp); } /* * Save information about the lower layer. 
* This needs to keep track of pathname * and directory information which union_vn_create * might need. */ if (lowervp != un->un_lowervp) { union_newlower(un, lowervp); if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; vref(dvp); un->un_dirvp = dvp; } } else if (lowervp) { vrele(lowervp); } *vpp = UNIONTOV(un); if (uppervp != dvp) VOP_UNLOCK(*vpp); error = 0; goto out; } un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK); mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE); un->un_refs = 1; un->un_mount = mp; un->un_vnode = NULL; un->un_uppervp = uppervp; un->un_lowervp = lowervp; un->un_pvp = undvp; if (undvp != NULLVP) vref(undvp); un->un_dircache = 0; un->un_openl = 0; un->un_cflags = 0; un->un_hooknode = false; un->un_uppersz = VNOVAL; un->un_lowersz = VNOVAL; if (dvp && cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; vref(dvp); un->un_dirvp = dvp; } else { un->un_path = 0; un->un_dirvp = 0; } if (docache) { mutex_enter(&uhash_lock); LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) { if (un1->un_lowervp == lowervp && un1->un_uppervp == uppervp && un1->un_mount == mp) { /* * Another thread beat us, push back freshly * allocated node and retry. */ mutex_exit(&uhash_lock); union_rele(un); goto loop; } } LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache); un->un_cflags |= UN_CACHED; mutex_exit(&uhash_lock); } error = vcache_get(mp, &un, sizeof(un), vpp); KASSERT(error != 0 || UNIONTOV(un) == *vpp); union_rele(un); if (error == ENOENT) goto loop; out: if (xlowervp) vrele(xlowervp); return error; } int union_freevp(struct vnode *vp) { struct union_node *un = VTOUNION(vp); /* Detach vnode from union node. */ un->un_vnode = NULL; un->un_uppersz = VNOVAL; un->un_lowersz = VNOVAL; /* Detach union node from vnode. */ mutex_enter(vp->v_interlock); vp->v_data = NULL; mutex_exit(vp->v_interlock); union_rele(un); return 0; } int union_loadvnode(struct mount *mp, struct vnode *vp, const void *key, size_t key_len, const void **new_key) { struct vattr va; struct vnode *svp; struct union_node *un; struct union_mount *um; voff_t uppersz, lowersz; KASSERT(key_len == sizeof(un)); memcpy(&un, key, key_len); um = MOUNTTOUNIONMOUNT(mp); svp = (un->un_uppervp != NULLVP) ? 
un->un_uppervp : un->un_lowervp; vp->v_tag = VT_UNION; vp->v_op = union_vnodeop_p; vp->v_data = un; un->un_vnode = vp; vp->v_type = svp->v_type; if (svp->v_type == VCHR || svp->v_type == VBLK) spec_node_init(vp, svp->v_rdev); vshareilock(vp, svp); rw_obj_hold(svp->v_uobj.vmobjlock); uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock); vshareklist(vp, svp); /* detect the root vnode (and aliases) */ if ((un->un_uppervp == um->um_uppervp) && ((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) { if (un->un_lowervp == NULLVP) { un->un_lowervp = um->um_lowervp; if (un->un_lowervp != NULLVP) vref(un->un_lowervp); } vp->v_vflag |= VV_ROOT; } uppersz = lowersz = VNOVAL; if (un->un_uppervp != NULLVP) { if (vn_lock(un->un_uppervp, LK_SHARED) == 0) { if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0) uppersz = va.va_size; VOP_UNLOCK(un->un_uppervp); } } if (un->un_lowervp != NULLVP) { if (vn_lock(un->un_lowervp, LK_SHARED) == 0) { if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0) lowersz = va.va_size; VOP_UNLOCK(un->un_lowervp); } } mutex_enter(&un->un_lock); union_newsize(vp, uppersz, lowersz); mutex_enter(&uhash_lock); union_ref(un); mutex_exit(&uhash_lock); *new_key = &vp->v_data; return 0; } /* * copyfile. copy the vnode (fvp) to the vnode (tvp) * using a sequence of reads and writes. both (fvp) * and (tvp) are locked on entry and exit. */ int union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred, struct lwp *l) { char *tbuf; struct uio uio; struct iovec iov; int error = 0; /* * strategy: * allocate a buffer of size MAXBSIZE. * loop doing reads and writes, keeping track * of the current uio offset. * give up at the first sign of trouble. */ uio.uio_offset = 0; UIO_SETUP_SYSSPACE(&uio); tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); /* ugly loop follows... */ do { off_t offset = uio.uio_offset; uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = tbuf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; error = VOP_READ(fvp, &uio, 0, cred); if (error == 0) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = tbuf; iov.iov_len = MAXBSIZE - uio.uio_resid; uio.uio_offset = offset; uio.uio_rw = UIO_WRITE; uio.uio_resid = iov.iov_len; if (uio.uio_resid == 0) break; do { error = VOP_WRITE(tvp, &uio, 0, cred); } while ((uio.uio_resid > 0) && (error == 0)); } } while (error == 0); free(tbuf, M_TEMP); return (error); } /* * (un) is assumed to be locked on entry and remains * locked on exit. */ int union_copyup(struct union_node *un, int docopy, kauth_cred_t cred, struct lwp *l) { int error; struct vnode *lvp, *uvp; struct vattr lvattr, uvattr; error = union_vn_create(&uvp, un, l); if (error) return (error); union_newupper(un, uvp); lvp = un->un_lowervp; if (docopy) { /* * XX - should not ignore errors * from VOP_CLOSE */ vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_GETATTR(lvp, &lvattr, cred); if (error == 0) error = VOP_OPEN(lvp, FREAD, cred); if (error == 0) { error = union_copyfile(lvp, uvp, cred, l); (void) VOP_CLOSE(lvp, FREAD, cred); } if (error == 0) { /* Copy permissions up too */ vattr_null(&uvattr); uvattr.va_mode = lvattr.va_mode; uvattr.va_flags = lvattr.va_flags; error = VOP_SETATTR(uvp, &uvattr, cred); } VOP_UNLOCK(lvp); #ifdef UNION_DIAGNOSTIC if (error == 0) uprintf("union: copied up %s\n", un->un_path); #endif } union_vn_close(uvp, FWRITE, cred, l); /* * Subsequent IOs will go to the top layer, so * call close on the lower vnode and open on the * upper vnode to ensure that the filesystem keeps * its references counts right. 
This doesn't do * the right thing with (cred) and (FREAD) though. * Ignoring error returns is not right, either. */ if (error == 0) { int i; vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY); for (i = 0; i < un->un_openl; i++) { (void) VOP_CLOSE(lvp, FREAD, cred); (void) VOP_OPEN(uvp, FREAD, cred); } un->un_openl = 0; VOP_UNLOCK(lvp); } return (error); } /* * Prepare the creation of a new node in the upper layer. * * (dvp) is the directory in which to create the new node. * it is locked on entry and exit. * (cnp) is the componentname to be created. * (cred, path, hash) are credentials, path and its hash to fill (cnp). */ static int union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred, const char *path) { int error; struct vnode *vp; cnp->cn_nameiop = CREATE; cnp->cn_flags = LOCKPARENT | ISLASTCN; cnp->cn_cred = cred; cnp->cn_nameptr = path; cnp->cn_namelen = strlen(path); error = VOP_LOOKUP(dvp, &vp, cnp); if (error == 0) { KASSERT(vp != NULL); VOP_ABORTOP(dvp, cnp); vrele(vp); error = EEXIST; } else if (error == EJUSTRETURN) { error = 0; } return error; } /* * Create a shadow directory in the upper layer. * The new vnode is returned locked. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the shadow directory. * it is unlocked on entry and exit. * (cnp) is the componentname to be created. * (vpp) is the returned newly created shadow directory, which * is returned locked. * * N.B. We still attempt to create shadow directories even if the union * is mounted read-only, which is a little nonintuitive. */ int union_mkshadow(struct union_mount *um, struct vnode *dvp, struct componentname *cnp, struct vnode **vpp) { int error; struct vattr va; struct componentname cn; char *pnbuf; if (cnp->cn_namelen + 1 > MAXPATHLEN) return ENAMETOOLONG; pnbuf = PNBUF_GET(); memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen); pnbuf[cnp->cn_namelen] = '\0'; vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); error = union_do_lookup(dvp, &cn, (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf); if (error) { VOP_UNLOCK(dvp); PNBUF_PUT(pnbuf); return error; } /* * policy: when creating the shadow directory in the * upper layer, create it owned by the user who did * the mount, group from parent directory, and mode * 777 modified by umask (ie mostly identical to the * mkdir syscall). (jsp, kb) */ vattr_null(&va); va.va_type = VDIR; va.va_mode = um->um_cmode; KASSERT(*vpp == NULL); error = VOP_MKDIR(dvp, vpp, &cn, &va); VOP_UNLOCK(dvp); PNBUF_PUT(pnbuf); return error; } /* * Create a whiteout entry in the upper layer. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the whiteout. * it is locked on entry and exit. * (cnp) is the componentname to be created. * (un) holds the path and its hash to be created. */ int union_mkwhiteout(struct union_mount *um, struct vnode *dvp, struct componentname *cnp, struct union_node *un) { int error; struct componentname cn; error = union_do_lookup(dvp, &cn, (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), un->un_path); if (error) return error; error = VOP_WHITEOUT(dvp, &cn, CREATE); return error; } /* * union_vn_create: creates and opens a new shadow file * on the upper union layer. this function is similar * in spirit to calling vn_open but it avoids calling namei(). 
* the problem with calling namei is that a) it locks too many * things, and b) it doesn't start at the "right" directory, * whereas union_do_lookup is told where to start. */ int union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l) { struct vnode *vp; kauth_cred_t cred = l->l_cred; struct vattr vat; struct vattr *vap = &vat; int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); int error; int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask; struct componentname cn; *vpp = NULLVP; vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY); error = union_do_lookup(un->un_dirvp, &cn, l->l_cred, un->un_path); if (error) { VOP_UNLOCK(un->un_dirvp); return error; } /* * Good - there was no race to create the file * so go ahead and create it. The permissions * on the file will be 0666 modified by the * current user's umask. Access to the file, while * it is unioned, will require access to the top *and* * bottom files. Access when not unioned will simply * require access to the top-level file. * TODO: confirm choice of access permissions. */ vattr_null(vap); vap->va_type = VREG; vap->va_mode = cmode; vp = NULL; error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap); if (error) { VOP_UNLOCK(un->un_dirvp); return error; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(un->un_dirvp); error = VOP_OPEN(vp, fmode, cred); if (error) { vput(vp); return error; } vp->v_writecount++; VOP_UNLOCK(vp); *vpp = vp; return 0; } int union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l) { if (fmode & FWRITE) --vp->v_writecount; return (VOP_CLOSE(vp, fmode, cred)); } void union_removed_upper(struct union_node *un) { struct vnode *vp = UNIONTOV(un); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #if 1 /* * We do not set the uppervp to NULLVP here, because lowervp * may also be NULLVP, so this routine would end up creating * a bogus union node with no upper or lower VP (that causes * pain in many places that assume at least one VP exists). * Since we've removed this node from the cache hash chains, * it won't be found again. When all current holders * release it, union_inactive() will vgone() it. */ union_diruncache(un); #else union_newupper(un, NULLVP); #endif VOP_UNLOCK(vp); mutex_enter(&uhash_lock); if (un->un_cflags & UN_CACHED) { un->un_cflags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } mutex_exit(&uhash_lock); } #if 0 struct vnode * union_lowervp(struct vnode *vp) { struct union_node *un = VTOUNION(vp); if ((un->un_lowervp != NULLVP) && (vp->v_type == un->un_lowervp->v_type)) { if (vget(un->un_lowervp, 0, true /* wait */) == 0) return (un->un_lowervp); } return (NULLVP); } #endif /* * determine whether a whiteout is needed * during a remove/rmdir operation. 
*/ int union_dowhiteout(struct union_node *un, kauth_cred_t cred) { struct vattr va; if (un->un_lowervp != NULLVP) return (1); if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 && (va.va_flags & OPAQUE)) return (1); return (0); } static void union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp) { struct union_node *un; if (vp->v_op != union_vnodeop_p) { if (vppp) { vref(vp); *(*vppp)++ = vp; if (--(*cntp) == 0) panic("union: dircache table too small"); } else { (*cntp)++; } return; } un = VTOUNION(vp); if (un->un_uppervp != NULLVP) union_dircache_r(un->un_uppervp, vppp, cntp); if (un->un_lowervp != NULLVP) union_dircache_r(un->un_lowervp, vppp, cntp); } struct vnode * union_dircache(struct vnode *vp, struct lwp *l) { int cnt; struct vnode *nvp = NULLVP; struct vnode **vpp; struct vnode **dircache; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); dircache = VTOUNION(vp)->un_dircache; nvp = NULLVP; if (dircache == 0) { cnt = 0; union_dircache_r(vp, 0, &cnt); cnt++; dircache = (struct vnode **) malloc(cnt * sizeof(struct vnode *), M_TEMP, M_WAITOK); vpp = dircache; union_dircache_r(vp, &vpp, &cnt); VTOUNION(vp)->un_dircache = dircache; *vpp = NULLVP; vpp = dircache + 1; } else { vpp = dircache; do { if (*vpp++ == VTOUNION(vp)->un_lowervp) break; } while (*vpp != NULLVP); } if (*vpp == NULLVP) goto out; vref(*vpp); error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, NULLVP, *vpp, 0); if (!error) { vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY); VTOUNION(vp)->un_dircache = 0; VTOUNION(nvp)->un_hooknode = true; VTOUNION(nvp)->un_dircache = dircache; } out: VOP_UNLOCK(vp); return (nvp); } void union_diruncache(struct union_node *un) { struct vnode **vpp; KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE); if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) vrele(*vpp); free(un->un_dircache, M_TEMP); un->un_dircache = 0; } } /* * Check whether node can rmdir (check empty). */ int union_check_rmdir(struct union_node *un, kauth_cred_t cred) { int dirlen, eofflag, error; char *dirbuf; struct vattr va; struct vnode *tvp; struct dirent *dp, *edp; struct componentname cn; struct iovec aiov; struct uio auio; KASSERT(un->un_uppervp != NULL); /* Check upper for being opaque. */ KASSERT(VOP_ISLOCKED(un->un_uppervp)); error = VOP_GETATTR(un->un_uppervp, &va, cred); if (error || (va.va_flags & OPAQUE)) return error; if (un->un_lowervp == NULL) return 0; /* Check lower for being empty. */ vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(un->un_lowervp, &va, cred); if (error) { VOP_UNLOCK(un->un_lowervp); return error; } dirlen = va.va_blocksize; dirbuf = kmem_alloc(dirlen, KM_SLEEP); /* error = 0; */ eofflag = 0; auio.uio_offset = 0; do { aiov.iov_len = dirlen; aiov.iov_base = dirbuf; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = aiov.iov_len; auio.uio_rw = UIO_READ; UIO_SETUP_SYSSPACE(&auio); error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag, NULL, NULL); if (error) break; edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid]; for (dp = (struct dirent *)dirbuf; error == 0 && dp < edp; dp = (struct dirent *)((char *)dp + dp->d_reclen)) { if (dp->d_reclen == 0) { error = ENOTEMPTY; break; } if (dp->d_type == DT_WHT || (dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2))) continue; /* Check for presence in the upper layer. 
*/ cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | RDONLY; cn.cn_cred = cred; cn.cn_nameptr = dp->d_name; cn.cn_namelen = dp->d_namlen; error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn); if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) { error = 0; continue; } if (error == 0) vrele(tvp); error = ENOTEMPTY; } } while (error == 0 && !eofflag); kmem_free(dirbuf, dirlen); VOP_UNLOCK(un->un_lowervp); return error; } /* * This hook is called from vn_readdir() to switch to lower directory * entry after the upper directory is read. */ int union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l) { struct vnode *vp = *vpp, *lvp; struct vattr va; int error; if (vp->v_op != union_vnodeop_p) return (0); /* * If the directory is opaque, * then don't show lower entries */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, fp->f_cred); VOP_UNLOCK(vp); if (error || (va.va_flags & OPAQUE)) return error; if ((lvp = union_dircache(vp, l)) == NULLVP) return (0); error = VOP_OPEN(lvp, FREAD, fp->f_cred); if (error) { vput(lvp); return (error); } VOP_UNLOCK(lvp); fp->f_vnode = lvp; fp->f_offset = 0; error = vn_close(vp, FREAD, fp->f_cred); if (error) return (error); *vpp = lvp; return (0); }
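/*
 * Illustrative sketch (not part of the original sources): the whiteout
 * decision in union_dowhiteout() above boils down to "record a whiteout on
 * remove/rmdir when a lower object exists, or when the upper directory is
 * marked opaque".  A stand-alone restatement of that rule, using a
 * simplified, hypothetical context instead of union_node/vattr, follows;
 * it is kept inside #if 0 so it is not compiled with the real code.
 */
#if 0
struct wh_ctx {
	int	has_lower;	/* hypothetical: a lower vnode exists */
	int	upper_opaque;	/* hypothetical: upper dir has OPAQUE set */
};

static int
whiteout_needed(const struct wh_ctx *ctx)
{
	/* A lower object would reappear after the remove: whiteout it. */
	if (ctx->has_lower)
		return 1;
	/* Opaque upper directory: removals must also leave whiteouts. */
	if (ctx->upper_opaque)
		return 1;
	return 0;
}
#endif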
/* $NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ /*- * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation * Facility, NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * This code is derived from software contributed to The NetBSD Foundation * by Rui Paulo. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_tcp_debug.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/errno.h> #include <sys/domain.h> #include <sys/kernel.h> #ifdef TCP_SIGNATURE #include <sys/md5.h> #endif #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> #include <netinet6/nd6.h> #endif #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> #ifdef INET6 #include <netipsec/ipsec6.h> #endif #endif #include <netinet/tcp.h> #define TCPOUTFLAGS #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_private.h> #include <netinet/tcp_congctl.h> #include <netinet/tcp_debug.h> #include <netinet/in_offload.h> #include <netinet6/in6_offload.h> /* * Knob to enable Congestion Window Monitoring, and control * the burst size it allows. Default burst is 4 packets, per * the Internet draft. 
*/ int tcp_cwm = 0; int tcp_cwm_burstsize = 4; int tcp_do_autosndbuf = 1; int tcp_autosndbuf_inc = 8 * 1024; int tcp_autosndbuf_max = 256 * 1024; #ifdef TCP_OUTPUT_COUNTERS #include <sys/device.h> extern struct evcnt tcp_output_bigheader; extern struct evcnt tcp_output_predict_hit; extern struct evcnt tcp_output_predict_miss; extern struct evcnt tcp_output_copysmall; extern struct evcnt tcp_output_copybig; extern struct evcnt tcp_output_refbig; #define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++ #else #define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */ #endif /* TCP_OUTPUT_COUNTERS */ static int tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep, bool *alwaysfragp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = NULL; struct rtentry *rt; struct ifnet *ifp; int size; int hdrlen; int optlen; *alwaysfragp = false; size = tcp_mssdflt; switch (tp->t_family) { case AF_INET: hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case AF_INET6: hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: hdrlen = 1; /* prevent zero sized segments */ goto out; } rt = inpcb_rtentry(inp); so = inp->inp_socket; if (rt == NULL) { goto out; } ifp = rt->rt_ifp; if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) { #ifdef INET6 if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: if path MTU is * smaller than 1280, use 1280 as packet size and * attach fragment header. */ size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag); *alwaysfragp = true; } else size = rt->rt_rmx.rmx_mtu - hdrlen; #else size = rt->rt_rmx.rmx_mtu - hdrlen; #endif } else if (ifp->if_flags & IFF_LOOPBACK) size = ifp->if_mtu - hdrlen; else if (inp->inp_af == AF_INET && tp->t_mtudisc) size = ifp->if_mtu - hdrlen; else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp))) size = ifp->if_mtu - hdrlen; #ifdef INET6 else if (inp->inp_af == AF_INET6) { if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) { /* mapped addr case */ struct in_addr d; memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d)); if (tp->t_mtudisc || in_localaddr(d)) size = ifp->if_mtu - hdrlen; } else { /* * for IPv6, path MTU discovery is always turned on, * or the node must use packet size <= 1280. */ size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU; size -= hdrlen; } } #endif inpcb_rtentry_unref(rt, inp); out: /* * Now we must make room for whatever extra TCP/IP options are in * the packet. */ optlen = tcp_optlen(tp); /* * XXX tp->t_ourmss should have the right size, but without this code * fragmentation will occur... need more investigation */ if (inp->inp_af == AF_INET) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec4_hdrsiz_tcp(tp); #endif optlen += ip_optlen(inp); } #ifdef INET6 if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec4_hdrsiz_tcp(tp); #endif /* XXX size -= ip_optlen(in6p); */ } else if (inp->inp_af == AF_INET6) { #if defined(IPSEC) if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND)) optlen += ipsec6_hdrsiz_tcp(tp); #endif optlen += ip6_optlen(inp); } #endif size -= optlen; /* * There may not be any room for data if mtu is too small. This * includes zero-sized. */ if (size <= 0) { return EMSGSIZE; } /* * *rxsegsizep holds *estimated* inbound segment size (estimation * assumes that path MTU is the same for both ways). 
this is only * for silly window avoidance, do not use the value for other purposes. * * ipseclen is subtracted from both sides, this may not be right. * I'm not quite sure about this (could someone comment). */ *txsegsizep = uimin(tp->t_peermss - optlen, size); *rxsegsizep = uimin(tp->t_ourmss - optlen, size); /* * Never send more than half a buffer full. This insures that we can * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and * therefore acks will never be delayed unless we run out of data to * transmit. */ if (so) { *txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep); } /* * A segment must at least store header + options */ if (*txsegsizep < hdrlen + optlen) { return EMSGSIZE; } if (*txsegsizep != tp->t_segsz) { /* * If the new segment size is larger, we don't want to * mess up the congestion window, but if it is smaller * we'll have to reduce the congestion window to ensure * that we don't get into trouble with initial windows * and the rest. In any case, if the segment size * has changed, chances are the path has, too, and * our congestion window will be different. */ if (*txsegsizep < tp->t_segsz) { tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz) * *txsegsizep, *txsegsizep); tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz) * *txsegsizep, *txsegsizep); } tp->t_segsz = *txsegsizep; } return 0; } static int tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off, long len, int hdrlen, struct mbuf **mp) { struct mbuf *m, *m0; uint64_t *tcps; tcps = TCP_STAT_GETREF(); if (tp->t_force && len == 1) tcps[TCP_STAT_SNDPROBE]++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->t_sndrexmitpack++; tcps[TCP_STAT_SNDREXMITPACK]++; tcps[TCP_STAT_SNDREXMITBYTE] += len; } else { tcps[TCP_STAT_SNDPACK]++; tcps[TCP_STAT_SNDBYTE] += len; } TCP_STAT_PUTREF(); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (__predict_false(m == NULL)) return ENOBUFS; MCLAIM(m, &tcp_tx_mowner); /* * XXX Because other code assumes headers will fit in * XXX one header mbuf. * * (This code should almost *never* be run.) */ if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) { TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader); MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); return ENOBUFS; } } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * To avoid traversing the whole sb_mb chain for correct * data to send, remember last sent mbuf, its offset and * the sent size. When called the next time, see if the * data to send is directly following the previous transfer. * This is important for large TCP windows. */ if (off == 0 || tp->t_lastm == NULL || (tp->t_lastoff + tp->t_lastlen) != off) { TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss); /* * Either a new packet or a retransmit. * Start from the beginning. 
*/ tp->t_lastm = so->so_snd.sb_mb; tp->t_inoff = off; } else { TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit); tp->t_inoff += tp->t_lastlen; } /* Traverse forward to next packet */ while (tp->t_inoff > 0) { if (tp->t_lastm == NULL) panic("tp->t_lastm == NULL"); if (tp->t_inoff < tp->t_lastm->m_len) break; tp->t_inoff -= tp->t_lastm->m_len; tp->t_lastm = tp->t_lastm->m_next; } tp->t_lastoff = off; tp->t_lastlen = len; m0 = tp->t_lastm; off = tp->t_inoff; if (len <= M_TRAILINGSPACE(m)) { m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen); m->m_len += len; TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall); } else { m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT); if (m->m_next == NULL) { m_freem(m); return ENOBUFS; } #ifdef TCP_OUTPUT_COUNTERS if (m->m_next->m_flags & M_EXT) TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig); else TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig); #endif } *mp = m; return 0; } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct rtentry *rt = NULL; struct socket *so; struct route *ro; long len, win; int off, flags, error; struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; u_char opt[MAX_TCPOPTLEN], *optp; #define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt)) unsigned optlen, hdrlen, packetlen; unsigned int sack_numblks; int idle, sendalot, txsegsize, rxsegsize; int txsegsize_nosack; int maxburst = TCP_MAXBURST; int af; /* address family on the wire */ int iphdrlen; int has_tso4, has_tso6; int has_tso, use_tso; bool alwaysfrag; int sack_rxmit; int sack_bytes_rxmt; int ecn_tos; struct sackhole *p; #ifdef TCP_SIGNATURE int sigoff = 0; #endif uint64_t *tcps; so = tp->t_inpcb->inp_socket; ro = &tp->t_inpcb->inp_route; switch (af = tp->t_family) { case AF_INET: case AF_INET6: if (tp->t_inpcb) break; return EINVAL; default: return EAFNOSUPPORT; } if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag)) return EMSGSIZE; idle = (tp->snd_max == tp->snd_una); /* * Determine if we can use TCP segmentation offload: * - If we're using IPv4 * - If there is not an IPsec policy that prevents it * - If the interface can do it */ has_tso4 = has_tso6 = false; has_tso4 = tp->t_inpcb->inp_af == AF_INET && #if defined(IPSEC) (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) && #endif (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL && (rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0; if (rt != NULL) { rtcache_unref(rt, &tp->t_inpcb->inp_route); rt = NULL; } #if defined(INET6) has_tso6 = tp->t_inpcb->inp_af == AF_INET6 && #if defined(IPSEC) (!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) && #endif (rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL && (rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0; if (rt != NULL) rtcache_unref(rt, &tp->t_inpcb->inp_route); #endif /* defined(INET6) */ has_tso = (has_tso4 || has_tso6) && !alwaysfrag; /* * Restart Window computation. From draft-floyd-incr-init-win-03: * * Optionally, a TCP MAY set the restart window to the * minimum of the value used for the initial window and * the current value of cwnd (in other words, using a * larger value for the restart window should never increase * the size of cwnd). */ if (tcp_cwm) { /* * Hughes/Touch/Heidemann Congestion Window Monitoring. * Count the number of packets currently pending * acknowledgement, and limit our congestion window * to a pre-determined allowed burst size plus that count. 
* This prevents bursting once all pending packets have * been acknowledged (i.e. transmission is idle). * * XXX Link this to Initial Window? */ tp->snd_cwnd = uimin(tp->snd_cwnd, (tcp_cwm_burstsize * txsegsize) + (tp->snd_nxt - tp->snd_una)); } else { if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. */ int ss = tcp_init_win; if (tp->t_inpcb->inp_af == AF_INET && in_localaddr(in4p_faddr(tp->t_inpcb))) ss = tcp_init_win_local; #ifdef INET6 else if (tp->t_inpcb->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(tp->t_inpcb))) ss = tcp_init_win_local; #endif tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_INITIAL_WINDOW(ss, txsegsize)); } } txsegsize_nosack = txsegsize; again: ecn_tos = 0; use_tso = has_tso; if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) { /* don't duplicate CWR/ECE. */ use_tso = 0; } TCP_REASS_LOCK(tp); sack_numblks = tcp_sack_numblks(tp); if (sack_numblks) { int sackoptlen; sackoptlen = TCP_SACK_OPTLEN(sack_numblks); if (sackoptlen > txsegsize_nosack) { sack_numblks = 0; /* give up SACK */ txsegsize = txsegsize_nosack; } else { if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { /* don't duplicate D-SACK. */ use_tso = 0; } txsegsize = txsegsize_nosack - sackoptlen; } } else { txsegsize = txsegsize_nosack; } /* * Determine length of data that should be transmitted, and * flags that should be used. If there is some data or critical * controls (SYN, RST) to send, then transmit; otherwise, * investigate further. * * Readjust SACK information to avoid resending duplicate data. */ if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = uimin(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery, reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; do { long cwin; if (!TCP_SACK_ENABLED(tp)) break; if (tp->t_partialacks < 0) break; p = tcp_sack_output(tp, &sack_bytes_rxmt); if (p == NULL) break; cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; break; } /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; if (off + len > so->so_snd.sb_cc) { /* 1 for TH_FIN */ KASSERT(off + len == so->so_snd.sb_cc + 1); KASSERT(p->rxmit + len == tp->snd_max); len = so->so_snd.sb_cc - off; } if (len > 0) { sack_rxmit = 1; sendalot = 1; } } while (/*CONSTCOND*/0); /* * If in persist timeout with window of 0, send 1 byte. 
* Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (win == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unset data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; win = 1; } else { TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } } if (sack_rxmit == 0) { if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ if (tp->snd_wnd < so->so_snd.sb_cc) { len = tp->snd_wnd - off; flags &= ~TH_FIN; } else { len = so->so_snd.sb_cc - off; } /* * From FreeBSD: * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; if (cwin < len) { len = cwin; flags &= ~TH_FIN; } } } else if (win < so->so_snd.sb_cc) { len = win - off; flags &= ~TH_FIN; } else { len = so->so_snd.sb_cc - off; } } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * If we have a pending FIN, either it has already been * transmitted or it is outside the window, so drop it. * If the FIN has been transmitted, but this is not a * retransmission, then len must be -1. Therefore we also * prevent here the sending of `gratuitous FINs'. This * eliminates the need to check for that case below (e.g. * to back up snd_nxt before the FIN so that the sequence * number is correct). */ len = 0; flags &= ~TH_FIN; if (win == 0) { TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) tcp_setpersist(tp); } } /* * Automatic sizing enables the performance of large buffers * and most of the efficiency of small ones by only allocating * space when it is needed. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. 
This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. */ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < tcp_autosndbuf_max && win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve(&so->so_snd, uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max), so)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } if (len > txsegsize) { if (use_tso) { /* * Truncate TSO transfers to IP_MAXPACKET, and make * sure that we send equal size transfers down the * stack (rather than big-small-big-small-...). */ #ifdef INET6 CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET); #endif len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize; if (len <= txsegsize) { use_tso = 0; } } else len = txsegsize; flags &= ~TH_FIN; sendalot = 1; } else use_tso = 0; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. If connection is idle * and can send all data, a maximum segment, * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. * If peer's buffer is tiny, then send * when window is at least half open. * If retransmitting (possibly after persist timer forced us * to send into a small window), then must resend. */ if (len) { if (len >= txsegsize) goto send; if ((so->so_state & SS_MORETOCOME) == 0 && ((idle || tp->t_flags & TF_NODELAY) && len + off >= so->so_snd.sb_cc)) goto send; if (tp->t_force) goto send; if (len >= tp->max_sndwnd / 2) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window known to peer * (as advertised window less next expected input). If the * difference is at least twice the size of the largest segment * we expect to receive (i.e. two segments) or at least 50% of * the maximum possible window, then want to send a window update * to peer. */ if (win > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale); long oldwin, adv; /* * rcv_nxt may overtake rcv_adv when we accept a * zero-window probe. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) oldwin = tp->rcv_adv - tp->rcv_nxt; else oldwin = 0; /* * If the new window size ends up being the same as or * less than the old size when it is scaled, then * don't force a window update. */ if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale) goto dontupdate; adv = recwin - oldwin; if (adv >= (long) (2 * rxsegsize)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe peer an ACK. */ if (tp->t_flags & TF_ACKNOW) goto send; if (flags & (TH_SYN|TH_FIN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. 
*/ if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) && !TCP_TIMER_ISARMED(tp, TCPT_REXMT) && !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] * is set when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * tp->t_timer[TCPT_REXMT] * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: TCP_REASS_UNLOCK(tp); return 0; send: /* * Before ESTABLISHED, force sending of initial options unless TCP set * not to do any options. * * Note: we assume that the IP/TCP header plus TCP options always fit * in a single mbuf, leaving room for a maximum link header, i.e.: * max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES */ optlen = 0; optp = opt; switch (af) { case AF_INET: iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case AF_INET6: iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /*pacify gcc*/ iphdrlen = 0; break; } hdrlen = iphdrlen; if (flags & TH_SYN) { struct rtentry *synrt; synrt = inpcb_rtentry(tp->t_inpcb); tp->snd_nxt = tp->iss; tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? synrt->rt_ifp : NULL, af); inpcb_rtentry_unref(synrt, tp->t_inpcb); if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) { *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; *optp++ = (tp->t_ourmss >> 8) & 0xff; *optp++ = tp->t_ourmss & 0xff; optlen += TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE)) && OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) { *((uint32_t *)optp) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optp += TCPOLEN_WINDOW + TCPOLEN_NOP; optlen += TCPOLEN_WINDOW + TCPOLEN_NOP; } if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) { *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; optlen += TCPOLEN_SACK_PERMITTED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. 
*/ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & (TH_SYN|TH_ACK)) == TH_SYN || (tp->t_flags & TF_RCVD_TSTMP))) { int alen = 0; while (optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; alen++; } if (OPT_FITS(TCPOLEN_TIMESTAMP)) { *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; uint32_t *lp = (uint32_t *)optp; /* Form timestamp option (appendix A of RFC 1323) */ *lp++ = htonl(TCP_TIMESTAMP(tp)); *lp = htonl(tp->ts_recent); optp += TCPOLEN_TIMESTAMP - 2; optlen += TCPOLEN_TIMESTAMP; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = TCP_TIMESTAMP(tp); } else { optp -= alen; optlen -= alen; } } #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { /* * Initialize TCP-MD5 option (RFC2385) */ if (!OPT_FITS(TCPOLEN_SIGNATURE)) goto reset; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; memset(optp, 0, TCP_SIGLEN); optlen += TCPOLEN_SIGNATURE; optp += TCP_SIGLEN; } #endif /* * Tack on the SACK block if it is necessary. */ if (sack_numblks) { int alen = 0; int sack_len = sack_numblks * 8; while (optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; alen++; } if (OPT_FITS(sack_len + 2)) { struct ipqent *tiqe; *optp++ = TCPOPT_SACK; *optp++ = sack_len + 2; uint32_t *lp = (uint32_t *)optp; if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) { sack_numblks--; *lp++ = htonl(tp->rcv_dsack_block.left); *lp++ = htonl(tp->rcv_dsack_block.right); tp->rcv_sack_flags &= ~TCPSACK_HAVED; } for (tiqe = TAILQ_FIRST(&tp->timeq); sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) { KASSERT(tiqe != NULL); sack_numblks--; *lp++ = htonl(tiqe->ipqe_seq); *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len + ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0)); } optlen += sack_len + 2; optp += sack_len; } else { optp -= alen; optlen -= alen; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { if (!OPT_FITS(TCPOLEN_EOL)) { reset: TCP_REASS_UNLOCK(tp); error = ECONNABORTED; goto out; } optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { if (!OPT_FITS(TCPOLEN_PAD)) goto reset; optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } TCP_REASS_UNLOCK(tp); hdrlen += optlen; #ifdef DIAGNOSTIC if (!use_tso && len > txsegsize) panic("tcp data to be sent is larger than segment"); else if (use_tso && len > IP_MAXPACKET) panic("tcp data to be sent is larger than max TSO size"); if (max_linkhdr + hdrlen > MCLBYTES) panic("tcphdr too big"); #endif /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m); if (error) goto out; /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) 
*/ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; } else { tcps = TCP_STAT_GETREF(); if (tp->t_flags & TF_ACKNOW) tcps[TCP_STAT_SNDACKS]++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcps[TCP_STAT_SNDCTRL]++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcps[TCP_STAT_SNDURG]++; else tcps[TCP_STAT_SNDWINUP]++; TCP_STAT_PUTREF(); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m != NULL && max_linkhdr + hdrlen > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (m == NULL) { error = ENOBUFS; goto out; } MCLAIM(m, &tcp_tx_mowner); m->m_data += max_linkhdr; m->m_len = hdrlen; } m_reset_rcvif(m); switch (af) { case AF_INET: ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; #endif th = (struct tcphdr *)(ip + 1); break; #ifdef INET6 case AF_INET6: ip = NULL; ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); break; #endif default: /*pacify gcc*/ ip = NULL; #ifdef INET6 ip6 = NULL; #endif th = NULL; break; } if (tp->t_template == NULL) panic("%s: no template", __func__); if (tp->t_template->m_len < iphdrlen) panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen); bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen); /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) { if (tp->t_flags & TF_SYN_REXMT) { if (tp->t_ecn_retries--) flags |= TH_ECE|TH_CWR; } else { flags |= TH_ECE|TH_CWR; tp->t_ecn_retries = tcp_ecn_maxretries; } } if (TCP_ECN_ALLOWED(tp)) { /* * If the peer has ECN, mark data packets * ECN capable. Ignore pure ack packets, retransmissions * and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !(tp->t_force && len == 1)) { ecn_tos = IPTOS_ECN_ECT0; TCP_STATINC(TCP_STAT_ECN_ECT); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) { flags |= TH_ECE; } } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (TCP_SACK_ENABLED(tp) && sack_rxmit) { th->th_seq = htonl(p->rxmit); p->rxmit += len; } else { if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { memcpy(th + 1, opt, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. 
*/ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize) win = 0; if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); if (th->th_win == 0) { tp->t_sndzerowin++; } if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { u_int32_t urp = tp->snd_up - tp->snd_nxt; if (urp > IP_MAXPACKET) urp = IP_MAXPACKET; th->th_urp = htons((u_int16_t)urp); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (sigoff && (tp->t_flags & TF_SIGNATURE)) { struct secasvar *sav; u_int8_t *sigp; sav = tcp_signature_getsav(m); if (sav == NULL) { if (m) m_freem(m); return EPERM; } m->m_pkthdr.len = hdrlen + len; sigp = (char *)th + sizeof(*th) + sigoff; tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp); key_sa_recordxfer(sav, m); KEY_SA_UNREF(&sav); } #endif /* * Set ourselves up to be checksummed just before the packet * hits the wire. */ switch (af) { case AF_INET: m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (use_tso) { m->m_pkthdr.segsz = txsegsize; m->m_pkthdr.csum_flags = M_CSUM_TSOv4; } else { m->m_pkthdr.csum_flags = M_CSUM_TCPv4; if (len + optlen) { /* Fixup the pseudo-header checksum. */ /* XXXJRT Not IP Jumbogram safe. */ th->th_sum = in_cksum_addword(th->th_sum, htons((u_int16_t) (len + optlen))); } } break; #ifdef INET6 case AF_INET6: m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (use_tso) { m->m_pkthdr.segsz = txsegsize; m->m_pkthdr.csum_flags = M_CSUM_TSOv6; } else { m->m_pkthdr.csum_flags = M_CSUM_TCPv6; if (len + optlen) { /* Fixup the pseudo-header checksum. */ /* XXXJRT: Not IPv6 Jumbogram safe. */ th->th_sum = in_cksum_addword(th->th_sum, htons((u_int16_t) (len + optlen))); } } break; #endif } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. * There are no states in which we send both a SYN and a FIN, * so we collapse the tests for these flags. */ if (flags & (TH_SYN|TH_FIN)) tp->snd_nxt++; if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = tcp_now; tp->t_rtseq = startseq; TCP_STATINC(TCP_STAT_SEGSTIMED); } } /* * Set retransmit timer if not currently set, * and not doing an ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. 
*/ timer: if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) { if ((sack_rxmit && tp->snd_nxt != tp->snd_max) || tp->snd_nxt != tp->snd_una) { if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); } else if (len == 0 && so->so_snd.sb_cc > 0 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { /* * If we are sending a window probe and there's * unacked data in the socket, make sure at * least the persist timer is running. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } } else if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; #ifdef TCP_DEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0); #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ m->m_pkthdr.len = hdrlen + len; switch (af) { case AF_INET: ip->ip_len = htons(m->m_pkthdr.len); packetlen = m->m_pkthdr.len; if (tp->t_inpcb->inp_af == AF_INET) { ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl; ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos; } #ifdef INET6 else if (tp->t_inpcb->inp_af == AF_INET6) { ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/ ip->ip_tos = ecn_tos; /*XXX*/ } #endif break; #ifdef INET6 case AF_INET6: packetlen = m->m_pkthdr.len; ip6->ip6_nxt = IPPROTO_TCP; if (tp->t_family == AF_INET6) { /* * we separately set hoplimit for every segment, since * the user might want to change the value via * setsockopt. Also, desired default hop limit might * be changed via Neighbor Discovery. */ ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb); } ip6->ip6_flow |= htonl(ecn_tos << 20); /* ip6->ip6_flow = ??? (from template) */ /* ip6_plen will be filled in ip6_output(). */ break; #endif default: /*pacify gcc*/ packetlen = 0; break; } switch (af) { case AF_INET: { struct mbuf *opts; if (tp->t_inpcb->inp_af == AF_INET) opts = tp->t_inpcb->inp_options; else opts = NULL; error = ip_output(m, opts, ro, (tp->t_mtudisc ? IP_MTUDISC : 0) | (so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb); break; } #ifdef INET6 case AF_INET6: { struct ip6_pktopts *opts; if (tp->t_inpcb->inp_af == AF_INET6) opts = in6p_outputopts(tp->t_inpcb); else opts = NULL; error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE, NULL, tp->t_inpcb, NULL); break; } #endif default: error = EAFNOSUPPORT; break; } if (error) { out: if (error == ENOBUFS) { TCP_STATINC(TCP_STAT_SELFQUENCH); tcp_quench(tp->t_inpcb); error = 0; } else if ((error == EHOSTUNREACH || error == ENETDOWN || error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; error = 0; } /* Back out the sequence number advance. */ if (sack_rxmit) p->rxmit -= len; /* Restart the delayed ACK timer, if necessary. */ if (tp->t_flags & TF_DELACK) TCP_RESTART_DELACK(tp); return error; } if (packetlen > tp->t_pmtud_mtu_sent) tp->t_pmtud_mtu_sent = packetlen; tcps = TCP_STAT_GETREF(); tcps[TCP_STAT_SNDTOTAL]++; if (tp->t_flags & TF_DELACK) tcps[TCP_STAT_DELACK]++; TCP_STAT_PUTREF(); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. 
*/ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~TF_ACKNOW; TCP_CLEAR_DELACK(tp); #ifdef DIAGNOSTIC if (maxburst < 0) printf("tcp_output: maxburst exceeded by %d\n", -maxburst); #endif if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst)) goto again; return 0; } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2); int nticks; if (TCP_TIMER_ISARMED(tp, TCPT_REXMT)) panic("tcp_output REXMT"); /* * Start/restart persistance timer. */ if (t < tp->t_rttmin) t = tp->t_rttmin; TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; }
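/*
 * Illustrative sketch (not part of the original sources): the persist
 * timer interval chosen by tcp_setpersist() above is a smoothed RTT
 * estimate scaled by the retransmit backoff table and clamped to a fixed
 * range (the TCPT_RANGESET() step).  A stand-alone restatement with
 * hypothetical backoff values and bounds, kept inside #if 0:
 */
#if 0
static int
persist_ticks(int srtt, int rttvar, int rttmin, int rxtshift)
{
	/* Hypothetical stand-ins for tcp_backoff[] and TCPTV_PERSMIN/MAX. */
	static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };
	const int persmin = 5, persmax = 120;
	int t, nticks;

	/* Same smoothing as tcp_setpersist(): ((srtt >> 2) + rttvar) >> 3. */
	t = ((srtt >> 2) + rttvar) >> (1 + 2);
	if (t < rttmin)
		t = rttmin;

	/* Back off exponentially, then clamp into [persmin, persmax]. */
	if (rxtshift > 8)
		rxtshift = 8;
	nticks = t * backoff[rxtshift];
	if (nticks < persmin)
		nticks = persmin;
	else if (nticks > persmax)
		nticks = persmax;
	return nticks;
}
#endif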
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */ /* * Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * uvm_object.c: operate with memory objects * * TODO: * 1. Support PG_RELEASED-using objects */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/rwlock.h> #include <sys/queue.h> #include <uvm/uvm.h> #include <uvm/uvm_ddb.h> #include <uvm/uvm_page_array.h> /* Page count to fetch per single step. */ #define FETCH_PAGECOUNT 16 /* * uvm_obj_init: initialize UVM memory object.
 */
void
uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops,
    bool alock, u_int refs)
{

#if 0 /* notyet */
	KASSERT(ops);
#endif
	if (alock) {
		/* Allocate and assign a lock. */
		uo->vmobjlock = rw_obj_alloc();
	} else {
		/* The lock will need to be set via uvm_obj_setlock(). */
		uo->vmobjlock = NULL;
	}
	uo->pgops = ops;
	LIST_INIT(&uo->uo_ubc);
	uo->uo_npages = 0;
	uo->uo_refs = refs;
	radix_tree_init_tree(&uo->uo_pages);
}

/*
 * uvm_obj_destroy: destroy UVM memory object.
 */
void
uvm_obj_destroy(struct uvm_object *uo, bool dlock)
{

	KASSERT(radix_tree_empty_tree_p(&uo->uo_pages));

	/* Purge any UBC entries associated with this object. */
	ubc_purge(uo);

	/* Destroy the lock, if requested. */
	if (dlock) {
		rw_obj_free(uo->vmobjlock);
	}
	radix_tree_fini_tree(&uo->uo_pages);
}

/*
 * uvm_obj_setlock: assign a vmobjlock to the UVM object.
 *
 * => Caller is responsible for ensuring that the UVM object is not in use.
 * => Only a dynamic lock may have been set previously; we drop its reference.
 */
void
uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr)
{
	krwlock_t *olockptr = uo->vmobjlock;

	if (olockptr) {
		/* Drop the reference on the old lock. */
		rw_obj_free(olockptr);
	}
	if (lockptr == NULL) {
		/* If no new lock is passed, allocate a default one. */
		lockptr = rw_obj_alloc();
	}
	uo->vmobjlock = lockptr;
}

/*
 * uvm_obj_wirepages: wire the pages of an entire UVM object.
 *
 * => NOTE: this function should only be used for types of objects
 *    where the PG_RELEASED flag is never set (aobj objects)
 * => caller must pass page-aligned start and end values
 */
int
uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end,
    struct pglist *list)
{
	int i, npages, error;
	struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL;
	off_t offset = start, left;

	left = (end - start) >> PAGE_SHIFT;

	rw_enter(uobj->vmobjlock, RW_WRITER);
	while (left) {

		npages = MIN(FETCH_PAGECOUNT, left);

		/* Get the pages */
		memset(pgs, 0, sizeof(pgs));
		error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0,
		    VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL,
		    PGO_SYNCIO);

		if (error)
			goto error;

		rw_enter(uobj->vmobjlock, RW_WRITER);
		for (i = 0; i < npages; i++) {

			KASSERT(pgs[i] != NULL);
			KASSERT(!(pgs[i]->flags & PG_RELEASED));

			/*
			 * Loan break
			 */
			if (pgs[i]->loan_count) {
				while (pgs[i]->loan_count) {
					pg = uvm_loanbreak(pgs[i]);
					if (!pg) {
						rw_exit(uobj->vmobjlock);
						uvm_wait("uobjwirepg");
						rw_enter(uobj->vmobjlock,
						    RW_WRITER);
						continue;
					}
				}
				pgs[i] = pg;
			}

			if (pgs[i]->flags & PG_AOBJ) {
				uvm_pagemarkdirty(pgs[i],
				    UVM_PAGE_STATUS_DIRTY);
				uao_dropswap(uobj, i);
			}
		}

		/* Wire the pages */
		for (i = 0; i < npages; i++) {
			uvm_pagelock(pgs[i]);
			uvm_pagewire(pgs[i]);
			uvm_pageunlock(pgs[i]);
			if (list != NULL)
				TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue);
		}

		/* Unbusy the pages */
		uvm_page_unbusy(pgs, npages);

		left -= npages;
		offset += npages << PAGE_SHIFT;
	}
	rw_exit(uobj->vmobjlock);
	return 0;

error:
	/* Unwire the pages which have been wired */
	uvm_obj_unwirepages(uobj, start, offset);

	return error;
}

/*
 * uvm_obj_unwirepages: unwire the pages of an entire UVM object.
 *
 * => NOTE: this function should only be used for types of objects
 *    where the PG_RELEASED flag is never set
 * => caller must pass page-aligned start and end values
 */
void
uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end)
{
	struct vm_page *pg;
	off_t offset;

	rw_enter(uobj->vmobjlock, RW_WRITER);
	for (offset = start; offset < end; offset += PAGE_SIZE) {
		pg = uvm_pagelookup(uobj, offset);

		KASSERT(pg != NULL);
		KASSERT(!(pg->flags & PG_RELEASED));

		uvm_pagelock(pg);
		uvm_pageunwire(pg);
		uvm_pageunlock(pg);
	}
	rw_exit(uobj->vmobjlock);
}

static inline bool
uvm_obj_notag_p(struct uvm_object *uobj, int tag)
{

	KASSERT(rw_lock_held(uobj->vmobjlock));
	return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag);
}

bool
uvm_obj_clean_p(struct uvm_object *uobj)
{

	return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG);
}

bool
uvm_obj_nowriteback_p(struct uvm_object *uobj)
{

	return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG);
}

static inline bool
uvm_obj_page_tag_p(struct vm_page *pg, int tag)
{
	struct uvm_object *uobj = pg->uobject;
	uint64_t pgidx = pg->offset >> PAGE_SHIFT;

	KASSERT(uobj != NULL);
	KASSERT(rw_lock_held(uobj->vmobjlock));
	return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0;
}

static inline void
uvm_obj_page_set_tag(struct vm_page *pg, int tag)
{
	struct uvm_object *uobj = pg->uobject;
	uint64_t pgidx = pg->offset >> PAGE_SHIFT;

	KASSERT(uobj != NULL);
	KASSERT(rw_write_held(uobj->vmobjlock));
	radix_tree_set_tag(&uobj->uo_pages, pgidx, tag);
}

static inline void
uvm_obj_page_clear_tag(struct vm_page *pg, int tag)
{
	struct uvm_object *uobj = pg->uobject;
	uint64_t pgidx = pg->offset >> PAGE_SHIFT;

	KASSERT(uobj != NULL);
	KASSERT(rw_write_held(uobj->vmobjlock));
	radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag);
}

bool
uvm_obj_page_dirty_p(struct vm_page *pg)
{

	return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG);
}

void
uvm_obj_page_set_dirty(struct vm_page *pg)
{

	uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG);
}

void
uvm_obj_page_clear_dirty(struct vm_page *pg)
{

	uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG);
}

bool
uvm_obj_page_writeback_p(struct vm_page *pg)
{

	return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG);
}

void
uvm_obj_page_set_writeback(struct vm_page *pg)
{

	uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}

void
uvm_obj_page_clear_writeback(struct vm_page *pg)
{

	uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}

#if defined(DDB) || defined(DEBUGPRINT)

/*
 * uvm_object_printit: actually prints the object
 */
void
uvm_object_printit(struct uvm_object *uobj, bool full,
    void (*pr)(const char *, ...))
{
	struct uvm_page_array a;
	struct vm_page *pg;
	int cnt = 0;
	voff_t off;

	(*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ",
	    uobj, rw_write_held(uobj->vmobjlock), uobj->pgops,
	    uobj->uo_npages);
	if (UVM_OBJ_IS_KERN_OBJECT(uobj))
		(*pr)("refs=<SYSTEM>\n");
	else
		(*pr)("refs=%d\n", uobj->uo_refs);

	if (!full) {
		return;
	}
	(*pr)("  PAGES <pg,offset>:\n  ");
	uvm_page_array_init(&a, uobj, 0);
	off = 0;
	while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
		cnt++;
		(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
		if ((cnt % 3) == 0) {
			(*pr)("\n  ");
		}
		off = pg->offset + PAGE_SIZE;
		uvm_page_array_advance(&a);
	}
	if ((cnt % 3) != 0) {
		(*pr)("\n");
	}
	uvm_page_array_fini(&a);
}
#endif /* DDB || DEBUGPRINT */
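/*
 * Illustrative, standalone sketch (not NetBSD code): uvm_obj_wirepages()
 * above fetches at most FETCH_PAGECOUNT pages per pgo_get call and advances
 * 'left' (a page count) and 'offset' (a byte offset) in lock step.  The
 * program below exercises only that batching arithmetic; PAGE_SHIFT, the
 * range size and the printout are assumptions for demonstration and no UVM
 * API is used.
 */
#include <stdio.h>

#define PAGE_SHIFT	12		/* illustrative, not the kernel's value */
#define PAGE_SIZE	(1 << PAGE_SHIFT)
#define FETCH_PAGECOUNT	16
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	long long start = 0, end = 100LL * PAGE_SIZE;	/* page-aligned range */
	long long offset = start;
	long long left = (end - start) >> PAGE_SHIFT;

	while (left) {
		int npages = MIN(FETCH_PAGECOUNT, left);

		/* A real caller would fetch and wire 'npages' pages here. */
		printf("batch at offset %lld: %d pages\n", offset, npages);

		left -= npages;
		offset += (long long)npages << PAGE_SHIFT;
	}
	return 0;
}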
/*	$NetBSD: subr_lockdebug.c,v 1.83 2022/09/02 06:01:38 nakayama Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Basic lock debugging code shared among lock primitives.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_lockdebug.c,v 1.83 2022/09/02 06:01:38 nakayama Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lockdebug.h>
#include <sys/sleepq.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lock.h>
#include <sys/rbtree.h>
#include <sys/ksyms.h>
#include <sys/kcov.h>

#include <machine/lock.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_access.h>
#include <ddb/db_sym.h>
#endif

unsigned int		ld_panic;

#ifdef LOCKDEBUG

#ifdef __ia64__
#define	LD_BATCH_SHIFT	16
#else
#define	LD_BATCH_SHIFT	9
#endif
#define	LD_BATCH	(1 << LD_BATCH_SHIFT)
#define	LD_BATCH_MASK	(LD_BATCH - 1)
#define	LD_MAX_LOCKS	1048576
#define	LD_SLOP		16

#define	LD_LOCKED	0x01
#define	LD_SLEEPER	0x02

#define	LD_WRITE_LOCK	0x80000000

typedef struct lockdebug {
	struct rb_node	ld_rb_node;
	__cpu_simple_lock_t ld_spinlock;
	_TAILQ_ENTRY(struct lockdebug, volatile) ld_chain;
	_TAILQ_ENTRY(struct lockdebug, volatile) ld_achain;
	volatile void	*ld_lo